Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/AArch64/AArch64CondBrTuning.cpp | 7
-rw-r--r--  lib/Target/AArch64/AArch64ConditionalCompares.cpp | 48
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 5
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td | 4
-rw-r--r--  lib/Target/AArch64/AArch64InstructionSelector.cpp | 9
-rw-r--r--  lib/Target/AArch64/AArch64LegalizerInfo.cpp | 9
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.cpp | 15
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.h | 2
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp | 2
-rw-r--r--  lib/Target/AArch64/AArch64SchedThunderX2T99.td | 1221
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.h | 1
-rw-r--r--  lib/Target/AArch64/AArch64TargetObjectFile.h | 3
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 77
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.h | 6
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 32
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 123
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h | 38
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 5
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 18
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 65
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp | 37
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h | 43
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/CMakeLists.txt | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3
-rw-r--r--  lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 60
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 18
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 4
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td | 8
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 44
-rw-r--r--  lib/Target/AMDGPU/SITypeRewriter.cpp | 156
-rw-r--r--  lib/Target/ARM/ARM.td | 32
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 45
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 15
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 8
-rw-r--r--  lib/Target/ARM/ARMInstructionSelector.cpp | 50
-rw-r--r--  lib/Target/ARM/ARMLegalizerInfo.cpp | 43
-rw-r--r--  lib/Target/ARM/ARMRegisterBankInfo.cpp | 12
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.td | 4
-rw-r--r--  lib/Target/ARM/ARMSchedule.td | 1
-rw-r--r--  lib/Target/ARM/ARMScheduleM3.td | 21
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 77
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 6
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 138
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.h | 3
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.cpp | 34
-rw-r--r--  lib/Target/ARM/ARMTargetObjectFile.h | 2
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 2
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 17
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 6
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h | 83
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp | 47
-rw-r--r--  lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h | 15
-rw-r--r--  lib/Target/BPF/BPFISelDAGToDAG.cpp | 312
-rw-r--r--  lib/Target/Hexagon/HexagonFrameLowering.cpp | 181
-rw-r--r--  lib/Target/Hexagon/HexagonFrameLowering.h | 17
-rw-r--r--  lib/Target/Hexagon/HexagonISelLowering.cpp | 77
-rw-r--r--  lib/Target/Hexagon/HexagonInstrInfo.cpp | 18
-rw-r--r--  lib/Target/Hexagon/HexagonNewValueJump.cpp | 63
-rw-r--r--  lib/Target/Hexagon/HexagonOptAddrMode.cpp | 10
-rw-r--r--  lib/Target/Hexagon/HexagonTargetMachine.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonTargetObjectFile.cpp | 9
-rw-r--r--  lib/Target/Hexagon/HexagonTargetObjectFile.h | 3
-rw-r--r--  lib/Target/Hexagon/HexagonTargetTransformInfo.cpp | 17
-rw-r--r--  lib/Target/Hexagon/HexagonTargetTransformInfo.h | 8
-rw-r--r--  lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 15
-rw-r--r--  lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 88
-rw-r--r--  lib/Target/Mips/MicroMips64r6InstrInfo.td | 12
-rw-r--r--  lib/Target/Mips/Mips64InstrInfo.td | 12
-rw-r--r--  lib/Target/Mips/MipsDelaySlotFiller.cpp | 2
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 135
-rw-r--r--  lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 99
-rw-r--r--  lib/Target/Mips/MipsSEISelDAGToDAG.h | 3
-rw-r--r--  lib/Target/Mips/MipsSEISelLowering.cpp | 176
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 4
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 3
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 11
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h | 34
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 4
-rw-r--r--  lib/Target/PowerPC/PPC.h | 6
-rw-r--r--  lib/Target/PowerPC/PPCCTRLoops.cpp | 108
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 9
-rw-r--r--  lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 24
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 5
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.h | 1
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 4
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.h | 3
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 12
-rw-r--r--  lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 16
-rw-r--r--  lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp | 6
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 7
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h | 1
-rw-r--r--  lib/Target/SystemZ/README.txt | 5
-rw-r--r--  lib/Target/SystemZ/SystemZ.td | 1
-rw-r--r--  lib/Target/SystemZ/SystemZFeatures.td | 22
-rw-r--r--  lib/Target/SystemZ/SystemZInstrFormats.td | 106
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.td | 86
-rw-r--r--  lib/Target/SystemZ/SystemZInstrSystem.td | 517
-rw-r--r--  lib/Target/SystemZ/SystemZRegisterInfo.td | 10
-rw-r--r--  lib/Target/SystemZ/SystemZScheduleZ13.td | 193
-rw-r--r--  lib/Target/SystemZ/SystemZScheduleZ196.td | 190
-rw-r--r--  lib/Target/SystemZ/SystemZScheduleZEC12.td | 191
-rw-r--r--  lib/Target/SystemZ/SystemZSubtarget.cpp | 5
-rw-r--r--  lib/Target/SystemZ/SystemZSubtarget.h | 15
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 2
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.h | 3
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyInstrControl.td | 26
-rw-r--r--  lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp | 2
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 112
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 20
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2
-rw-r--r--  lib/Target/X86/X86.td | 30
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 106
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 645
-rw-r--r--  lib/Target/X86/X86InstructionSelector.cpp | 53
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp | 20
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 55
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 47
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 1
132 files changed, 5152 insertions, 1599 deletions
diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp
index f27bc97ec3f3..0a948812ff33 100644
--- a/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -22,7 +22,7 @@
/// cbz w8, .LBB1_2 -> b.eq .LBB1_2
///
/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses.
-/// tbz w8, #31, .LBB6_2 -> b.ge .LBB6_2
+/// tbz w8, #31, .LBB6_2 -> b.pl .LBB6_2
///
//===----------------------------------------------------------------------===//
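
Why PL/MI rather than GE/LT: `tbz w8, #31` tests the sign bit of the result
directly, which after `subs` is exactly the N flag (b.pl / b.mi), whereas
b.ge / b.lt test N == V and diverge whenever the subtraction overflows. A
minimal standalone C++ sketch (not part of the patch) simulating the flags
for one overflowing case:

    #include <cassert>
    #include <cstdint>

    int main() {
      // subs w8, w0, w1 with w0 = INT32_MIN, w1 = 1 overflows:
      // the result is 0x7fffffff, so N = 0 but V = 1.
      int32_t w0 = INT32_MIN, w1 = 1;
      int32_t w8 = static_cast<int32_t>(static_cast<uint32_t>(w0) -
                                        static_cast<uint32_t>(w1));
      bool N = w8 < 0;
      bool V = ((w0 < 0) != (w1 < 0)) && ((w8 < 0) != (w0 < 0));

      bool TbzTaken = ((static_cast<uint32_t>(w8) >> 31) & 1) == 0; // bit 31 clear
      bool PlTaken = !N;       // b.pl: taken iff N == 0; always agrees with tbz #31
      bool GeTaken = (N == V); // b.ge: disagrees here because V == 1
      assert(TbzTaken == PlTaken && TbzTaken != GeTaken);
      return 0;
    }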
@@ -129,11 +129,11 @@ MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) {
break;
case AArch64::TBZW:
case AArch64::TBZX:
- CC = AArch64CC::GE;
+ CC = AArch64CC::PL;
break;
case AArch64::TBNZW:
case AArch64::TBNZX:
- CC = AArch64CC::LT;
+ CC = AArch64CC::MI;
break;
}
return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc))
@@ -271,6 +271,7 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
}
break;
}
+ (void)NewCmp; (void)NewBr;
assert(NewCmp && NewBr && "Expected new instructions.");
DEBUG(dbgs() << " with instruction:\n ");
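
The (void) casts added above are the usual idiom for keeping NewCmp and NewBr
"used" in release builds, where NDEBUG compiles the assert away and
-Wunused-but-set-variable would otherwise fire. A tiny standalone
illustration of the pattern:

    #include <cassert>

    int compute() { return 42; }

    void demo() {
      int Result = compute();
      (void)Result; // keeps the warning quiet when NDEBUG removes the assert
      assert(Result == 42 && "unexpected value");
    }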
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 00a0111f2bd2..9eda56c825a9 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -139,6 +140,7 @@ class SSACCmpConv {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ const MachineBranchProbabilityInfo *MBPI;
public:
/// The first block containing a conditional branch, dominating everything
@@ -186,8 +188,10 @@ private:
public:
/// runOnMachineFunction - Initialize per-function data structures.
- void runOnMachineFunction(MachineFunction &MF) {
+ void runOnMachineFunction(MachineFunction &MF,
+ const MachineBranchProbabilityInfo *MBPI) {
this->MF = &MF;
+ this->MBPI = MBPI;
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -564,8 +568,40 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
updateTailPHIs();
- Head->removeSuccessor(CmpBB, true);
- CmpBB->removeSuccessor(Tail, true);
+
+ // Save successor probabilities before removing CmpBB and Tail from their
+ // parents.
+ BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB);
+ BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail);
+
+ Head->removeSuccessor(CmpBB);
+ CmpBB->removeSuccessor(Tail);
+
+ // If Head and CmpBB had successor probabilities, update the probabilities to
+ // reflect the ccmp-conversion.
+ if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) {
+
+ // Head is allowed two successors. We've removed CmpBB, so the remaining
+ // successor is Tail. We need to increase the successor probability for
+ // Tail to account for the CmpBB path we removed.
+ //
+ // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB).
+ assert(*Head->succ_begin() == Tail && "Head successor is not Tail");
+ BranchProbability Head2Tail = MBPI->getEdgeProbability(Head, Tail);
+ Head->setSuccProbability(Head->succ_begin(),
+ Head2Tail + Head2CmpBB * CmpBB2Tail);
+
+ // We will transfer successors of CmpBB to Head in a moment without
+ // normalizing the successor probabilities. Set the successor probabilities
+ // before doing so.
+ //
+ // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB).
+ for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) {
+ BranchProbability CmpBB2I = MBPI->getEdgeProbability(CmpBB, *I);
+ CmpBB->setSuccProbability(I, Head2CmpBB * CmpBB2I);
+ }
+ }
+
Head->transferSuccessorsAndUpdatePHIs(CmpBB);
DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
TII->removeBranch(*Head);
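
To make the update concrete, a hedged numeric sketch (hypothetical
probabilities, not taken from the patch) using the same
llvm::BranchProbability arithmetic: with Pr(CmpBB|Head) = 3/5,
Pr(Tail|Head) = 2/5 and Pr(Tail|CmpBB) = 1/2, the merged Head->Tail edge
becomes 2/5 + 3/5 * 1/2 = 7/10.

    #include "llvm/Support/BranchProbability.h"
    using llvm::BranchProbability;

    BranchProbability mergedHeadToTail() {
      BranchProbability Head2CmpBB(3, 5); // Pr(CmpBB|Head) = 0.6
      BranchProbability Head2Tail(2, 5);  // Pr(Tail|Head)  = 0.4
      BranchProbability CmpBB2Tail(1, 2); // Pr(Tail|CmpBB) = 0.5
      // Tail is now reached either directly or via the folded CmpBB path:
      // 0.4 + 0.6 * 0.5 = 0.7.
      return Head2Tail + Head2CmpBB * CmpBB2Tail;
    }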
@@ -717,6 +753,7 @@ int SSACCmpConv::expectedCodeSizeDelta() const {
namespace {
class AArch64ConditionalCompares : public MachineFunctionPass {
+ const MachineBranchProbabilityInfo *MBPI;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MCSchedModel SchedModel;
@@ -753,6 +790,7 @@ char AArch64ConditionalCompares::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
@@ -763,6 +801,7 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
}
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -892,12 +931,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
MinSize = MF.getFunction()->optForMinSize();
bool Changed = false;
- CmpConv.runOnMachineFunction(MF);
+ CmpConv.runOnMachineFunction(MF, MBPI);
// Visit blocks in dominator tree pre-order. The pre-order enables multiple
// cmp-conversions from the same head block.
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2965106fd270..aaf32a499bc3 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7561,8 +7561,9 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
-
+ SubVec = Builder.CreateIntToPtr(
+ SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
+ VecTy->getVectorNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
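
The subtlety in the hunk above: SubVec has VecTy->getVectorNumElements()
elements, while SVI->getType() can describe a shuffle result of a different
length, so the old code could hand CreateIntToPtr a wrongly sized vector
type. A hedged standalone sketch of the corrected cast (helper name is ours;
APIs as in LLVM of this period):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"

    // Pair the shuffle's pointer element type with the *loaded* vector's
    // element count when building the IntToPtr destination type.
    static llvm::Value *castSubVecToPointers(llvm::IRBuilder<> &Builder,
                                             llvm::Value *SubVec,
                                             llvm::Type *VecTy,
                                             llvm::ShuffleVectorInst *SVI) {
      llvm::Type *PtrEltTy = SVI->getType()->getVectorElementType();
      return Builder.CreateIntToPtr(
          SubVec,
          llvm::VectorType::get(PtrEltTy, VecTy->getVectorNumElements()));
    }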
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index ad24612239fa..6cb723d187af 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -735,7 +735,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>;
def : ShiftAlias<"rorv", RORVXr, GPR64>;
// Multiply-add
-let AddedComplexity = 7 in {
+let AddedComplexity = 5 in {
defm MADD : MulAccum<0, "madd", add>;
defm MSUB : MulAccum<1, "msub", sub>;
@@ -752,7 +752,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
-} // AddedComplexity = 7
+} // AddedComplexity = 5
let AddedComplexity = 5 in {
def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 9bfd570e9a82..07ce0e863c5e 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -947,7 +947,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
@@ -964,16 +964,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
if (DstRC == SrcRC) {
// Nothing to be done
+ } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
+ SrcTy == LLT::scalar(64)) {
+ llvm_unreachable("TableGen can import this case");
+ return false;
} else if (DstRC == &AArch64::GPR32RegClass &&
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
+ DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 01196817f311..4b568f3fba2b 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -39,6 +39,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
+ for (auto Ty : {p0, s1, s8, s16, s32, s64})
+ setAction({G_IMPLICIT_DEF, Ty}, Legal);
+
for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
// These operations naturally get the right answer when used on
// GPR32, even if the actual type is narrower.
@@ -99,6 +102,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
// G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
}
+ for (auto Ty : {s1, s8, s16, s32, s64, p0})
+ setAction({G_EXTRACT, Ty}, Legal);
+
+ for (auto Ty : {s32, s64})
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+
for (unsigned MemOp : {G_LOAD, G_STORE}) {
for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
setAction({MemOp, Ty}, Legal);
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 45083df7ab45..f82b9dbc2c9f 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -151,13 +151,24 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
return MCOperand::createExpr(Expr);
}
+MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+ const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
if (Printer.TM.getTargetTriple().isOSDarwin())
return lowerSymbolOperandDarwin(MO, Sym);
+ if (Printer.TM.getTargetTriple().isOSBinFormatCOFF())
+ return lowerSymbolOperandCOFF(MO, Sym);
- assert(Printer.TM.getTargetTriple().isOSBinFormatELF() &&
- "Expect Darwin or ELF target");
+ assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && "Invalid target");
return lowerSymbolOperandELF(MO, Sym);
}
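
As a usage sketch: the new COFF path emits a plain VK_None symbol reference
and folds any non-jump-table offset into an add expression, so a hypothetical
global `foo` at offset 16 lowers to the MCExpr foo+16. A hedged standalone
equivalent (helper name is ours):

    #include "llvm/MC/MCContext.h"
    #include "llvm/MC/MCExpr.h"

    static const llvm::MCExpr *symbolPlusOffset(llvm::MCContext &Ctx,
                                                llvm::MCSymbol *Sym,
                                                int64_t Offset) {
      const llvm::MCExpr *Expr =
          llvm::MCSymbolRefExpr::create(Sym, llvm::MCSymbolRefExpr::VK_None, Ctx);
      if (Offset) // mirrors the !MO.isJTI() && MO.getOffset() guard above
        Expr = llvm::MCBinaryExpr::createAdd(
            Expr, llvm::MCConstantExpr::create(Offset, Ctx), Ctx);
      return Expr;
    }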
diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
index 1e29b80c2d62..aa30fe1fa707 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/lib/Target/AArch64/AArch64MCInstLower.h
@@ -42,6 +42,8 @@ public:
MCSymbol *Sym) const;
MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
MCSymbol *Sym) const;
+ MCOperand lowerSymbolOperandCOFF(const MachineOperand &MO,
+ MCSymbol *Sym) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index baf15ac540cf..fab92e139dd0 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -94,7 +94,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
return CSR_AArch64_TLS_Darwin_RegMask;
- assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS");
+ assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
}
diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 3654eeca530a..10df50bcf156 100644
--- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -1,4 +1,4 @@
-//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 Scheduling ---*- tablegen -*-=//
+//=- AArch64SchedThunderX2T99.td - Cavium ThunderX2 T99 ---*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -79,75 +79,207 @@ def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>;
// 60 entry unified scheduler.
def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2,
- THX2T99P3, THX2T99P4, THX2T99P5]> {
- let BufferSize=60;
+ THX2T99P3, THX2T99P4, THX2T99P5]> {
+ let BufferSize = 60;
}
// Define commonly used write types for InstRW specializations.
// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>.
// 3 cycles on I1.
-def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 3; }
+def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on I2.
+def THX2T99Write_1Cyc_I2 : SchedWriteRes<[THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
// 4 cycles on I1.
-def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 4; }
+def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 23 cycles on I1.
+def THX2T99Write_23Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// 39 cycles on I1.
+def THX2T99Write_39Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
// 1 cycle on I0, I1, or I2.
-def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { let Latency = 1; }
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on I0, I1, or I2.
+def THX2T99Write_2Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I0, I1, or I2.
+def THX2T99Write_4Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on I0, I1, or I2.
+def THX2T99Write_5Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
// 5 cycles on F1.
-def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
// 7 cycles on F1.
-def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
// 4 cycles on F0 or F1.
-def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
// 5 cycles on F0 or F1.
-def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
// 6 cycles on F0 or F1.
-def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
// 7 cycles on F0 or F1.
-def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
// 8 cycles on F0 or F1.
-def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 8; }
+def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 10 cycles on F0 or F1.
+def THX2T99Write_10Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
// 16 cycles on F0 or F1.
def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
let Latency = 16;
+ let NumMicroOps = 3;
let ResourceCycles = [8];
}
// 23 cycles on F0 or F1.
def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
let Latency = 23;
+ let NumMicroOps = 3;
let ResourceCycles = [11];
}
// 1 cycles on LS0 or LS1.
-def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 1; }
+def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 0;
+}
+
+// 1 cycle on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_1Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_1Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+
+// 2 cycles on LS0 or LS1.
+def THX2T99Write_2Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
// 4 cycles on LS0 or LS1.
-def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
// 5 cycles on LS0 or LS1.
-def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
// 6 cycles on LS0 or LS1.
-def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_4Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_4Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
// 5 cycles on LS0 or LS1 and I0, I1, or I2.
def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
let Latency = 5;
- let NumMicroOps = 2;
+ let NumMicroOps = 3;
}
// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
-def THX2T99Write_6Cyc_LS01_I012_I012 :
+def THX2T99Write_5Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_6Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_6Cyc_LS01_I012_I012 :
SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
let Latency = 6;
let NumMicroOps = 3;
@@ -162,25 +294,25 @@ def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
// 5 cycles on LS0 or LS1 and F0 or F1.
def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 5;
- let NumMicroOps = 2;
+ let NumMicroOps = 3;
}
// 6 cycles on LS0 or LS1 and F0 or F1.
def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 6;
- let NumMicroOps = 2;
+ let NumMicroOps = 3;
}
// 7 cycles on LS0 or LS1 and F0 or F1.
def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 7;
- let NumMicroOps = 2;
+ let NumMicroOps = 3;
}
// 8 cycles on LS0 or LS1 and F0 or F1.
def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
let Latency = 8;
- let NumMicroOps = 2;
+ let NumMicroOps = 3;
}
// Define commonly used read types.
@@ -195,10 +327,8 @@ def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
-
}
-
//===----------------------------------------------------------------------===//
// 3. Instruction Tables.
@@ -211,88 +341,217 @@ let SchedModel = ThunderX2T99Model in {
// Branch, immed
// Branch and link, immed
// Compare and branch
-def : WriteRes<WriteBr, [THX2T99I2]> { let Latency = 1; }
+def : WriteRes<WriteBr, [THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
def : WriteRes<WriteSys, []> { let Latency = 1; }
def : WriteRes<WriteBarrier, []> { let Latency = 1; }
def : WriteRes<WriteHint, []> { let Latency = 1; }
-def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteAtomic, []> {
+ let Unsupported = 1;
+ let NumMicroOps = 2;
+}
-// Branch, register
-// Branch and link, register != LR
-// Branch and link, register = LR
-def : WriteRes<WriteBrReg, [THX2T99I2]> { let Latency = 1; }
+//---
+// Branch
+//---
+def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>;
+def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>;
+def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>;
+def : InstRW<[THX2T99Write_1Cyc_I2],
+ (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>;
//---
// 3.2 Arithmetic and Logical Instructions
// 3.3 Move and Shift Instructions
//---
+
// ALU, basic
// Conditional compare
// Conditional select
// Address generation
-def : WriteRes<WriteI, [THX2T99I012]> { let Latency = 1; }
+def : WriteRes<WriteI, [THX2T99I012]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteI],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
def : InstRW<[WriteI], (instrs COPY)>;
// ALU, extend and/or shift
def : WriteRes<WriteISReg, [THX2T99I012]> {
let Latency = 2;
- let ResourceCycles = [2];
+ let ResourceCycles = [2, 3];
+ let NumMicroOps = 2;
}
+def : InstRW<[WriteISReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
def : WriteRes<WriteIEReg, [THX2T99I012]> {
- let Latency = 2;
- let ResourceCycles = [2];
+ let Latency = 1;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
}
+def : InstRW<[WriteIEReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
// Move immed
-def : WriteRes<WriteImm, [THX2T99I012]> { let Latency = 1; }
+def : WriteRes<WriteImm, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>;
// Variable shift
-def : WriteRes<WriteIS, [THX2T99I012]> { let Latency = 1; }
+def : WriteRes<WriteIS, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
//---
// 3.4 Divide and Multiply Instructions
//---
// Divide, W-form
-// Latency range of 13-23. Take the average.
+// Latency range of 13-23/13-39.
def : WriteRes<WriteID32, [THX2T99I1]> {
- let Latency = 18;
- let ResourceCycles = [18];
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
}
// Divide, X-form
-// Latency range of 13-39. Take the average.
def : WriteRes<WriteID64, [THX2T99I1]> {
- let Latency = 26;
- let ResourceCycles = [26];
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
}
// Multiply accumulate, W-form
-def : WriteRes<WriteIM32, [THX2T99I012]> { let Latency = 5; }
+def : WriteRes<WriteIM32, [THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
// Multiply accumulate, X-form
-def : WriteRes<WriteIM64, [THX2T99I012]> { let Latency = 5; }
+def : WriteRes<WriteIM64, [THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX2T99Write_5Cyc_I012],
+// (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[THX2T99Write_5Cyc_I012],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+
+def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>;
// Bitfield extract, two reg
-def : WriteRes<WriteExtr, [THX2T99I012]> { let Latency = 1; }
+def : WriteRes<WriteExtr, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Multiply high
+def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous Data-Processing Instructions
+// Bitfield extract
+def : InstRW<[THX2T99Write_1Cyc_I012], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move - basic
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>;
-// Bitfield move, basic
// Bitfield move, insert
-// NOTE: Handled by WriteIS.
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "^BFM")>;
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "(S|U)?BFM.*")>;
// Count leading
def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$",
- "^CLZ(W|X)r$")>;
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits
+def : InstRW<[THX2T99Write_1Cyc_I012], (instrs RBITWr, RBITXr)>;
+
+// Cryptography Extensions
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES[DE]")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AESI?MC")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1SU0")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1[CMP]")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256SU0")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256(H|H2|SU1)")>;
+
+// CRC Instructions
+// def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>;
+def : InstRW<[THX2T99Write_4Cyc_I1],
+ (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>;
+
+def : InstRW<[THX2T99Write_4Cyc_I1],
+ (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>;
// Reverse bits/bytes
// NOTE: Handled by WriteI.
//---
-// 3.6 Load Instructions
+// 3.6 Load Instructions
// 3.10 FP Load Instructions
//---
@@ -300,13 +559,29 @@ def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$",
// Load register, unscaled immed
// Load register, immed unprivileged
// Load register, unsigned immed
-def : WriteRes<WriteLD, [THX2T99LS01]> { let Latency = 4; }
+def : WriteRes<WriteLD, [THX2T99LS01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
// Load register, immed post-index
// NOTE: Handled by WriteLD, WriteI.
// Load register, immed pre-index
// NOTE: Handled by WriteLD, WriteAdr.
-def : WriteRes<WriteAdr, [THX2T99I012]> { let Latency = 1; }
+def : WriteRes<WriteAdr, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+}
// Load register offset, basic
// Load register, register offset, scale by 4/8
@@ -324,23 +599,229 @@ def THX2T99ReadAdrBase : SchedReadVariant<[
SchedVar<NoSchedPred, [ReadDefault]>]>;
def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>;
-// Load pair, immed offset, normal
-// Load pair, immed offset, signed words, base != SP
-// Load pair, immed offset signed words, base = SP
-// LDP only breaks into *one* LS micro-op. Thus
-// the resources are handling by WriteLD.
-def : WriteRes<WriteLDHi, []> {
- let Latency = 5;
-}
-
// Load pair, immed pre-index, normal
// Load pair, immed pre-index, signed words
// Load pair, immed post-index, normal
// Load pair, immed post-index, signed words
// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRBui)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDui)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRHui)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRQui)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRSui)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRQl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRWl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRXl)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRXi)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSWi)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpost)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpost)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRBpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRHpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRBpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRHpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRDroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRQroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRDroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRQroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHXroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRXroX)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURDi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURQi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSWi)>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMl)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFUMi)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMui)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroW)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroX)>;
+
//--
-// 3.7 Store Instructions
+// 3.7 Store Instructions
// 3.11 FP Store Instructions
//--
@@ -382,6 +863,195 @@ def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> {
// Store pair, immed pre-index, X-form
// NOTE: Handled by WriteAdr, WriteSTP.
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBBi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHHi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURSi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURWi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRXi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPXi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPWi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPXi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPWi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRBui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRBui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRDui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRDui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRHui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRHui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRQui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRQui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRXui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRXui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRWui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRWui)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+
//---
// 3.8 FP Data Processing Instructions
//---
@@ -389,28 +1059,95 @@ def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> {
// FP absolute value
// FP min/max
// FP negate
-def : WriteRes<WriteF, [THX2T99F01]> { let Latency = 5; }
+def : WriteRes<WriteF, [THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
// FP arithmetic
def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
// FP compare
-def : WriteRes<WriteFCmp, [THX2T99F01]> { let Latency = 5; }
+def : WriteRes<WriteFCmp, [THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
-// FP divide, S-form
-// FP square root, S-form
-def : WriteRes<WriteFDiv, [THX2T99F01]> {
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFDiv, [THX2T99F01]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THX2T99XWriteFDiv : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFDivSP : SchedWriteRes<[THX2T99F01]> {
let Latency = 16;
let ResourceCycles = [8];
+ let NumMicroOps = 4;
}
+def THX2T99XWriteFDivDP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFSqrtSP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFSqrtDP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>;
+
// FP divide, D-form
// FP square root, D-form
-def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>;
// FP multiply
// FP multiply accumulate
-def : WriteRes<WriteFMul, [THX2T99F01]> { let Latency = 6; }
+def : WriteRes<WriteFMul, [THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX2T99XWriteFMul : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX2T99XWriteFMulAcc : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def : InstRW<[THX2T99XWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[THX2T99XWriteFMulAcc],
+ (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
// FP round to integral
def : InstRW<[THX2T99Write_7Cyc_F01],
@@ -426,15 +1163,25 @@ def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>;
// FP convert, from vec to vec reg
// FP convert, from gen to vec reg
// FP convert, from vec to gen reg
-def : WriteRes<WriteFCvt, [THX2T99F01]> { let Latency = 7; }
+def : WriteRes<WriteFCvt, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
// FP move, immed
// FP move, register
-def : WriteRes<WriteFImm, [THX2T99F01]> { let Latency = 4; }
+def : WriteRes<WriteFImm, [THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
// FP transfer, from gen to vec reg
// FP transfer, from vec to gen reg
-def : WriteRes<WriteFCopy, [THX2T99F01]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
//---
@@ -470,19 +1217,135 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
// ASIMD shift by register, basic, Q-form
// ASIMD shift by register, complex, D-form
// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [THX2T99F01]> { let Latency = 7; }
+def : WriteRes<WriteV, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4, 23];
+}
// ASIMD arith, reduce, 4H/4S
// ASIMD arith, reduce, 8B/8H
// ASIMD arith, reduce, 16B
-def : InstRW<[THX2T99Write_5Cyc_F01],
- (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
// ASIMD logical (MOV, MVN, ORN, ORR)
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv",
+ "^ORRv", "^ORNv", "^NOTv")>;
+// ASIMD arith, reduce
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
// ASIMD polynomial (8x8) multiply long
-def : InstRW<[THX2T99Write_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B/16H
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B/16H
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^(P?MUL|SQR?DMULH)" #
+ "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+ "(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv",
+ "SQSHRNv","SQSHRUNv", "UQRSHRNv",
+ "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+// ASIMD shift by immed, complex
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU][QR]{1,2}SHL" #
+ "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+ "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+ "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADALP","^UADALP")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLPv","^UADDLPv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLV","^UADDLV")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SUQADDv","^USQADDv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv",
+ "^SQABS", "^SQADD", "^SQNEG", "^SQSUB",
+ "^SRHADD", "^SUBHNv", "^SUQADD",
+ "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^CMEQv","^CMGEv","^CMGTv",
+ "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv",
+ "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>;
//---
// 3.13 ASIMD Floating-point Instructions
@@ -493,7 +1356,8 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>;
// ASIMD FP arith, normal, D-form
// ASIMD FP arith, normal, Q-form
-def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FABDv", "^FADDv", "^FSUBv")>;
// ASIMD FP arith, pairwise, D-form
// ASIMD FP arith, pairwise, Q-form
@@ -503,8 +1367,15 @@ def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>;
// ASIMD FP compare, Q-form
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
- "^FCMGTv", "^FCMLEv",
- "^FCMLTv")>;
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
// ASIMD FP convert, long
// ASIMD FP convert, narrow
@@ -512,14 +1383,26 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
// ASIMD FP convert, other, Q-form
// NOTE: Handled by WriteV.
+// ASIMD FP convert, long and narrow
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
// ASIMD FP divide, D-form, F32
def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv2f32")>;
// ASIMD FP divide, Q-form, F32
def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv4f32")>;
// ASIMD FP divide, Q-form, F64
def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "FDIVv2f64")>;
// ASIMD FP max/min, normal, D-form
// ASIMD FP max/min, normal, Q-form
@@ -540,20 +1423,24 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
// ASIMD FP multiply, Q-form, FZ
// ASIMD FP multiply, Q-form, no FZ
def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP multiply accumulate, D-form, FZ
// ASIMD FP multiply accumulate, D-form, no FZ
// ASIMD FP multiply accumulate, Q-form, FZ
// ASIMD FP multiply accumulate, Q-form, no FZ
def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
// ASIMD FP negate
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>;
-// ASIMD FP round, D-form
-// ASIMD FP round, Q-form
-// NOTE: Handled by WriteV.
-
//--
// 3.14 ASIMD Miscellaneous Instructions
//--
@@ -563,37 +1450,66 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
// ASIMD bitwise insert, D-form
// ASIMD bitwise insert, Q-form
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^BIFv", "^BITv", "^BSLv")>;
// ASIMD count, D-form
// ASIMD count, Q-form
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^CLSv", "^CLZv", "^CNTv")>;
// ASIMD duplicate, gen reg
// ASIMD duplicate, element
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>;
// ASIMD extract
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>;
// ASIMD extract narrow
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^XTNv")>;
+
// ASIMD extract narrow, saturating
-// NOTE: Handled by WriteV.
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
// ASIMD insert, element to element
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
+
// ASIMD move, integer immed
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
// ASIMD move, FP immed
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
+// ASIMD table lookup, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>;
+
+// ASIMD table lookup, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transpose
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
// ASIMD reciprocal estimate, D-form
// ASIMD reciprocal estimate, Q-form
-def : InstRW<[THX2T99Write_5Cyc_F01],
+def : InstRW<[THX2T99Write_5Cyc_F01],
(instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
- "^FRSQRTEv", "^URSQRTEv")>;
+ "^FRSQRTEv", "^URSQRTEv")>;
// ASIMD reciprocal step, D-form, FZ
// ASIMD reciprocal step, D-form, no FZ
@@ -602,7 +1518,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01],
def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
// ASIMD reverse
-def : InstRW<[THX2T99Write_5Cyc_F01],
+def : InstRW<[THX2T99Write_5Cyc_F01],
(instregex "^REV16v", "^REV32v", "^REV64v")>;
// ASIMD table lookup, D-form
@@ -610,135 +1526,135 @@ def : InstRW<[THX2T99Write_5Cyc_F01],
def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
// ASIMD transfer, element to word or dword
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^UMOVv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
// ASIMD transfer, element to gen reg
-def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "(S|U)MOVv.*")>;
// ASIMD transfer gen reg to element
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
// ASIMD transpose
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
- "^UZP1v", "^UZP2v")>;
+ "^UZP1v", "^UZP2v")>;
// ASIMD unzip/zip
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
//--
-// 3.15 ASIMD Load Instructions
+// 3.15 ASIMD Load Instructions
//--
// ASIMD load, 1 element, multiple, 1 reg, D-form
// ASIMD load, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[THX2T99Write_4Cyc_LS01],
+def : InstRW<[THX2T99Write_4Cyc_LS01],
(instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
(instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 2 reg, D-form
// ASIMD load, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[THX2T99Write_4Cyc_LS01],
+def : InstRW<[THX2T99Write_4Cyc_LS01],
(instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
(instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 3 reg, D-form
// ASIMD load, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[THX2T99Write_5Cyc_LS01],
+def : InstRW<[THX2T99Write_5Cyc_LS01],
(instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr],
(instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 4 reg, D-form
// ASIMD load, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[THX2T99Write_6Cyc_LS01],
+def : InstRW<[THX2T99Write_6Cyc_LS01],
(instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr],
(instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, one lane, B/H/S
// ASIMD load, 1 element, one lane, D
def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD1i(8|16|32|64)_POST$")>;
// ASIMD load, 1 element, all lanes, D-form, B/H/S
// ASIMD load, 1 element, all lanes, D-form, D
// ASIMD load, 1 element, all lanes, Q-form
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
(instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, multiple, D-form, B/H/S
// ASIMD load, 2 element, multiple, Q-form, D
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
(instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, one lane, B/H
// ASIMD load, 2 element, one lane, S
// ASIMD load, 2 element, one lane, D
def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD2i(8|16|32|64)_POST$")>;
// ASIMD load, 2 element, all lanes, D-form, B/H/S
// ASIMD load, 2 element, all lanes, D-form, D
// ASIMD load, 2 element, all lanes, Q-form
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
(instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
(instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 3 element, multiple, D-form, B/H/S
// ASIMD load, 3 element, multiple, Q-form, B/H/S
// ASIMD load, 3 element, multiple, Q-form, D
-def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
(instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
(instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 3 element, one lane, B/H
// ASIMD load, 3 element, one lane, S
// ASIMD load, 3 element, one lane, D
def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
(instregex "^LD3i(8|16|32|64)_POST$")>;
// ASIMD load, 3 element, all lanes, D-form, B/H/S
// ASIMD load, 3 element, all lanes, D-form, D
// ASIMD load, 3 element, all lanes, Q-form, B/H/S
// ASIMD load, 3 element, all lanes, Q-form, D
-def : InstRW<[THX2T99Write_7Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01],
(instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
(instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, multiple, D-form, B/H/S
// ASIMD load, 4 element, multiple, Q-form, B/H/S
// ASIMD load, 4 element, multiple, Q-form, D
-def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
(instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
(instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, one lane, B/H
// ASIMD load, 4 element, one lane, S
// ASIMD load, 4 element, one lane, D
def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
(instregex "^LD4i(8|16|32|64)_POST$")>;
// ASIMD load, 4 element, all lanes, D-form, B/H/S
// ASIMD load, 4 element, all lanes, D-form, D
// ASIMD load, 4 element, all lanes, Q-form, B/H/S
// ASIMD load, 4 element, all lanes, Q-form, D
-def : InstRW<[THX2T99Write_6Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01],
(instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
(instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
//--
@@ -747,106 +1663,83 @@ def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
// ASIMD store, 1 element, multiple, 1 reg, D-form
// ASIMD store, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[THX2T99Write_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 2 reg, D-form
// ASIMD store, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[THX2T99Write_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 3 reg, D-form
// ASIMD store, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[THX2T99Write_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 4 reg, D-form
// ASIMD store, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[THX2T99Write_1Cyc_LS01],
+def : InstRW<[THX2T99Write_1Cyc_LS01],
(instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
(instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, one lane, B/H/S
// ASIMD store, 1 element, one lane, D
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST1i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST1i(8|16|32|64)_POST$")>;
// ASIMD store, 2 element, multiple, D-form, B/H/S
// ASIMD store, 2 element, multiple, Q-form, B/H/S
// ASIMD store, 2 element, multiple, Q-form, D
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 2 element, one lane, B/H/S
// ASIMD store, 2 element, one lane, D
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST2i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST2i(8|16|32|64)_POST$")>;
// ASIMD store, 3 element, multiple, D-form, B/H/S
// ASIMD store, 3 element, multiple, Q-form, B/H/S
// ASIMD store, 3 element, multiple, Q-form, D
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 3 element, one lane, B/H
// ASIMD store, 3 element, one lane, S
// ASIMD store, 3 element, one lane, D
def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST3i(8|16|32|64)_POST$")>;
// ASIMD store, 4 element, multiple, D-form, B/H/S
// ASIMD store, 4 element, multiple, Q-form, B/H/S
// ASIMD store, 4 element, multiple, Q-form, D
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
(instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
// ASIMD store, 4 element, one lane, B/H
// ASIMD store, 4 element, one lane, S
// ASIMD store, 4 element, one lane, D
def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
-def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
(instregex "^ST4i(8|16|32|64)_POST$")>;
-//--
-// 3.17 Cryptography Extensions
-//--
-
-// Crypto AES ops
-def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES")>;
-
-// Crypto polynomial (64x64) multiply long
-def : InstRW<[THX2T99Write_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
-
-// Crypto SHA1 xor ops
-// Crypto SHA1 schedule acceleration ops
-// Crypto SHA256 schedule acceleration op (1 u-op)
-// Crypto SHA256 schedule acceleration op (2 u-ops)
-// Crypto SHA256 hash acceleration ops
-def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA")>;
-
-//--
-// 3.18 CRC
-//--
-
-// CRC checksum ops
-def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32")>;
-
} // SchedModel = ThunderX2T99Model
+
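// A minimal sketch (plain C++, not the LLVM API) of how a scheduler can
// consume the WriteRes fields added above. Assuming the usual reading of
// ResourceCycles, the most contended resource bounds the issue rate; all
// names and values below are illustrative only.
#include <algorithm>
#include <cstdio>
#include <vector>

struct WriteResModel {
  int Latency;                     // cycles until the result is ready
  int NumMicroOps;                 // issue/decode cost of the write
  std::vector<int> ResourceCycles; // busy cycles per referenced resource
  std::vector<int> UnitsPerRes;    // units backing each resource group
};

// Reciprocal throughput: max over resources of busy cycles / units.
static double recipThroughput(const WriteResModel &W) {
  double RT = 0.0;
  for (size_t I = 0; I < W.ResourceCycles.size(); ++I)
    RT = std::max(RT, double(W.ResourceCycles[I]) / W.UnitsPerRes[I]);
  return RT;
}

int main() {
  // Rough stand-in for the double-precision divide entry above: latency 23,
  // 12 busy cycles on the two-unit F0/F1 pipe group.
  WriteResModel FDivDP{23, 4, {12}, {2}};
  std::printf("rthroughput = %.1f\n", recipThroughput(FDivDP)); // 6.0
}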
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 6660f0babb8a..1252f9403812 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -167,6 +167,8 @@ extern "C" void LLVMInitializeAArch64Target() {
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
return llvm::make_unique<AArch64_MachoTargetObjectFile>();
+ if (TT.isOSBinFormatCOFF())
+ return llvm::make_unique<AArch64_COFFTargetObjectFile>();
return llvm::make_unique<AArch64_ELFTargetObjectFile>();
}
@@ -179,6 +181,8 @@ static std::string computeDataLayout(const Triple &TT,
return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
if (TT.isOSBinFormatMachO())
return "e-m:o-i64:64-i128:128-n32:64-S128";
+ if (TT.isOSBinFormatCOFF())
+ return "e-m:w-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 2c75a3258c1c..fefa7e26b79f 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -36,6 +36,7 @@ public:
~AArch64TargetMachine() override;
const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
+ const AArch64Subtarget *getSubtargetImpl() const = delete;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
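// A minimal sketch (illustrative, not LLVM code) of the '= delete' idiom
// added above: deleting the parameterless overload turns any call that
// forgets the per-function argument into a compile-time error.
struct SubtargetHolderSketch {
  int getSubtargetImpl(int FnKey) const { return FnKey; } // kept overload
  int getSubtargetImpl() const = delete;                  // now unusable
};
// SubtargetHolderSketch{}.getSubtargetImpl();  // error: deleted function
int main() { return SubtargetHolderSketch{}.getSubtargetImpl(7) == 7 ? 0 : 1; }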
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 47e3bce43f6e..9077eb7902fd 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -45,6 +45,9 @@ public:
const TargetMachine &TM) const override;
};
+/// This implementation is used for AArch64 COFF targets.
+class AArch64_COFFTargetObjectFile : public TargetLoweringObjectFileCOFF {};
+
} // end namespace llvm
#endif
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a4328682b93c..a76f080530bb 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -20,6 +20,23 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64tti"
+static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
+ cl::init(true), cl::Hidden);
+
+bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+  // Inline a callee if its target-features are a subset of the caller's
+  // target-features.
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
+
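// A standalone check of the subset test above (assumed 8-bit values,
// std::bitset standing in for FeatureBitset):
#include <bitset>
#include <cassert>
int main() {
  std::bitset<8> CallerBits("01101110"); // caller's target features
  std::bitset<8> CalleeBits("00101100"); // callee's target features
  // Every callee feature is also a caller feature exactly when
  // intersecting with the caller leaves the callee bits unchanged.
  assert((CallerBits & CalleeBits) == CalleeBits);
  return 0;
}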
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
@@ -631,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return ST->getMaxInterleaveFactor();
}
-void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+// For Falkor, we want to avoid having too many strided loads in a loop since
+// that can exhaust the HW prefetcher resources. We adjust the unroller
+// MaxCount preference below to attempt to ensure unrolling doesn't create too
+// many strided loads.
+static void
+getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ enum { MaxStridedLoads = 7 };
+ auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
+ int StridedLoads = 0;
+ // FIXME? We could make this more precise by looking at the CFG and
+ // e.g. not counting loads in each side of an if-then-else diamond.
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ LoadInst *LMemI = dyn_cast<LoadInst>(&I);
+ if (!LMemI)
+ continue;
+
+ Value *PtrValue = LMemI->getPointerOperand();
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE.getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+ continue;
+
+ // FIXME? We could take pairing of unrolled load copies into account
+ // by looking at the AddRec, but we would probably have to limit this
+ // to loops with no stores or other memory optimization barriers.
+ ++StridedLoads;
+ // We've seen enough strided loads that seeing more won't make a
+ // difference.
+ if (StridedLoads > MaxStridedLoads / 2)
+ return StridedLoads;
+ }
+ }
+ return StridedLoads;
+ };
+
+ int StridedLoads = countStridedLoads(L, SE);
+ DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+ << " strided loads\n");
+ // Pick the largest power of 2 unroll count that won't result in too many
+ // strided loads.
+ if (StridedLoads) {
+ UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
+ DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
+ << '\n');
+ }
+}
+
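// A worked check of the MaxCount formula above (illustrative values;
// __builtin_clz stands in for llvm::Log2_32): with MaxStridedLoads = 7 and
// 3 strided loads detected, 7/3 = 2 and Log2_32(2) = 1, so MaxCount becomes
// 1 << 1 = 2 and the unrolled loop holds 6 strided loads, within budget.
#include <cassert>
#include <cstdint>
static unsigned log2u(uint32_t V) { return 31u - __builtin_clz(V); }
int main() {
  const unsigned MaxStridedLoads = 7;
  for (unsigned Strided = 1; Strided <= MaxStridedLoads; ++Strided) {
    unsigned MaxCount = 1u << log2u(MaxStridedLoads / Strided);
    assert(Strided * MaxCount <= MaxStridedLoads); // never exceeds budget
  }
  return 0;
}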
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Enable partial unrolling and runtime unrolling.
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
// For inner loop, it is more likely to be a hot one, and the runtime check
// can be promoted out from LICM pass, so the overhead is less, let's try
@@ -644,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+
+ if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+ EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
}
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 290a1ca1f24b..31c037354925 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -51,6 +51,9 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+
/// \name Scalar TTI Implementations
/// @{
@@ -119,7 +122,8 @@ public:
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 3d075018904c..475f91016840 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -541,14 +541,13 @@ public:
return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
}
- void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, bool &IsResolved) override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
};
-void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCFixup &Fixup,
- const MCValue &Target,
- bool &IsResolved) {
+bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
@@ -562,11 +561,24 @@ void ELFAArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
// section isn't 0x1000-aligned, we therefore need to delegate this decision
// to the linker -- a relocation!
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
- IsResolved = false;
+ return true;
+ return false;
}
}
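// The ADRP hazard described above, made concrete with assumed addresses:
// the page delta the instruction encodes depends on the ADRP's own PC, so
// the assembler must punt to the linker unless section alignment is known.
#include <cassert>
#include <cstdint>
static int64_t adrpPageDelta(uint64_t PC, uint64_t Target) {
  return ((int64_t)(Target & ~0xfffULL) - (int64_t)(PC & ~0xfffULL)) >> 12;
}
int main() {
  assert(adrpPageDelta(0x1000, 0x2fff) == 1); // one page forward
  assert(adrpPageDelta(0x0fff, 0x2fff) == 2); // one byte earlier: two pages
  return 0;
}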
+namespace {
+class COFFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+ COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple)
+ : AArch64AsmBackend(T, /*IsLittleEndian*/true) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createAArch64WinCOFFObjectWriter(OS);
+ }
+};
+}
+
MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const Triple &TheTriple,
@@ -575,7 +587,11 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
if (TheTriple.isOSBinFormatMachO())
return new DarwinAArch64AsmBackend(T, MRI);
- assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+ if (TheTriple.isOSBinFormatCOFF())
+ return new COFFAArch64AsmBackend(T, TheTriple);
+
+ assert(TheTriple.isOSBinFormatELF() && "Invalid target");
+
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
bool IsILP32 = Options.getABIName() == "ilp32";
return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index f7dda92fb551..89c3e5b4c76e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -49,10 +49,11 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
/*HasRelocationAddend*/ true),
IsILP32(IsILP32) {}
-#define R_CLS(rtype) \
- IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
-#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\
- "supported (LP64 eqv: " #lp64rtype ")"
+#define R_CLS(rtype) \
+ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
+#define BAD_ILP32_MOV(lp64rtype) \
+ "ILP32 absolute MOV relocation not " \
+ "supported (LP64 eqv: " #lp64rtype ")"
// assumes IsILP32 is true
static bool isNonILP32reloc(const MCFixup &Fixup,
@@ -60,44 +61,45 @@ static bool isNonILP32reloc(const MCFixup &Fixup,
MCContext &Ctx) {
if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
return false;
- switch(RefKind) {
- case AArch64MCExpr::VK_ABS_G3:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
- return true;
- case AArch64MCExpr::VK_ABS_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
- return true;
- case AArch64MCExpr::VK_ABS_G2_S:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
- return true;
- case AArch64MCExpr::VK_ABS_G2_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
- return true;
- case AArch64MCExpr::VK_ABS_G1_S:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
- return true;
- case AArch64MCExpr::VK_ABS_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
- return true;
- case AArch64MCExpr::VK_DTPREL_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
- return true;
- case AArch64MCExpr::VK_DTPREL_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
- return true;
- case AArch64MCExpr::VK_TPREL_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
- return true;
- case AArch64MCExpr::VK_TPREL_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
- return true;
- case AArch64MCExpr::VK_GOTTPREL_G1:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
- return true;
- case AArch64MCExpr::VK_GOTTPREL_G0_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
- return true;
- default: return false;
+ switch (RefKind) {
+ case AArch64MCExpr::VK_ABS_G3:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2_S:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
+ return true;
+ case AArch64MCExpr::VK_ABS_G1_S:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
+ return true;
+ case AArch64MCExpr::VK_ABS_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_DTPREL_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
+ return true;
+ case AArch64MCExpr::VK_DTPREL_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_TPREL_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
+ return true;
+ case AArch64MCExpr::VK_TPREL_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
+ return true;
+ case AArch64MCExpr::VK_GOTTPREL_G0_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
+ return true;
+ default:
+ return false;
}
return false;
}
@@ -130,7 +132,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(PREL32);
case FK_Data_8:
if (IsILP32) {
- Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data "
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 8 byte PC relative data "
"relocation not supported (LP64 eqv: PREL64)");
return ELF::R_AARCH64_NONE;
} else
@@ -178,7 +181,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
}
} else {
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
- return ELF::R_AARCH64_NONE;
+ return ELF::R_AARCH64_NONE;
switch ((unsigned)Fixup.getKind()) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
@@ -189,8 +192,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(ABS32);
case FK_Data_8:
if (IsILP32) {
- Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte absolute data "
- "relocation not supported (LP64 eqv: ABS64)");
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 8 byte absolute data "
+ "relocation not supported (LP64 eqv: ABS64)");
return ELF::R_AARCH64_NONE;
} else
return ELF::R_AARCH64_ABS64;
@@ -262,7 +266,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
} else {
Ctx.reportError(Fixup.getLoc(),
"LP64 4 byte unchecked GOT load/store relocation "
- "not supported (ILP32 eqv: LD32_GOT_LO12_NC");
+ "not supported (ILP32 eqv: LD32_GOT_LO12_NC");
return ELF::R_AARCH64_NONE;
}
}
@@ -270,12 +274,12 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (IsILP32) {
Ctx.reportError(Fixup.getLoc(),
"ILP32 4 byte checked GOT load/store relocation "
- "not supported (unchecked eqv: LD32_GOT_LO12_NC)");
+ "not supported (unchecked eqv: LD32_GOT_LO12_NC)");
} else {
Ctx.reportError(Fixup.getLoc(),
"LP64 4 byte checked GOT load/store relocation "
- "not supported (unchecked/ILP32 eqv: "
- "LD32_GOT_LO12_NC)");
+ "not supported (unchecked/ILP32 eqv: "
+ "LD32_GOT_LO12_NC)");
}
return ELF::R_AARCH64_NONE;
}
@@ -283,7 +287,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (IsILP32) {
return ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC;
} else {
- Ctx.reportError(Fixup.getLoc(), "LP64 32-bit load/store "
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 32-bit load/store "
"relocation not supported (ILP32 eqv: "
"TLSIE_LD32_GOTTPREL_LO12_NC)");
return ELF::R_AARCH64_NONE;
@@ -295,14 +300,14 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
} else {
Ctx.reportError(Fixup.getLoc(),
"LP64 4 byte TLSDESC load/store relocation "
- "not supported (ILP32 eqv: TLSDESC_LD64_LO12)");
+ "not supported (ILP32 eqv: TLSDESC_LD64_LO12)");
return ELF::R_AARCH64_NONE;
}
}
Ctx.reportError(Fixup.getLoc(),
"invalid fixup for 32-bit load/store instruction "
- "fixup_aarch64_ldst_imm12_scale4");
+ "fixup_aarch64_ldst_imm12_scale4");
return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
@@ -312,8 +317,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AARCH64_LD64_GOT_LO12_NC;
} else {
Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
- "relocation not supported (LP64 eqv: "
- "LD64_GOT_LO12_NC)");
+ "relocation not supported (LP64 eqv: "
+ "LD64_GOT_LO12_NC)");
return ELF::R_AARCH64_NONE;
}
}
@@ -330,8 +335,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
} else {
Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
- "relocation not supported (LP64 eqv: "
- "TLSIE_LD64_GOTTPREL_LO12_NC)");
+ "relocation not supported (LP64 eqv: "
+ "TLSIE_LD64_GOTTPREL_LO12_NC)");
return ELF::R_AARCH64_NONE;
}
}
@@ -340,8 +345,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AARCH64_TLSDESC_LD64_LO12;
} else {
Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
- "relocation not supported (LP64 eqv: "
- "TLSDESC_LD64_LO12)");
+ "relocation not supported (LP64 eqv: "
+ "TLSDESC_LD64_LO12)");
return ELF::R_AARCH64_NONE;
}
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 031aa8b81e35..a0de3c39562b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64TargetStreamer.h"
+#include "AArch64WinCOFFStreamer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
@@ -30,6 +31,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
@@ -210,6 +212,8 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
+ if (TT.isOSBinFormatCOFF())
+ return new AArch64TargetWinCOFFStreamer(S);
return nullptr;
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index 0f5b765c7697..4293dcba955e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -16,53 +16,47 @@ namespace llvm {
namespace AArch64 {
enum Fixups {
- // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
- // an ADR instruction.
+ // A 21-bit pc-relative immediate inserted into an ADR instruction.
fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
- // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
- // an ADRP instruction.
+ // A 21-bit pc-relative immediate inserted into an ADRP instruction.
fixup_aarch64_pcrel_adrp_imm21,
- // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
- // No alignment adjustment. All value bits are encoded.
+ // 12-bit fixup for add/sub instructions. No alignment adjustment. All value
+ // bits are encoded.
fixup_aarch64_add_imm12,
- // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
- // store instructions.
+  // Unsigned 12-bit fixups for load and store instructions.
fixup_aarch64_ldst_imm12_scale1,
fixup_aarch64_ldst_imm12_scale2,
fixup_aarch64_ldst_imm12_scale4,
fixup_aarch64_ldst_imm12_scale8,
fixup_aarch64_ldst_imm12_scale16,
- // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
- // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
- // pc-relative loads and generates relocations directly when necessary.
+ // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
+ // fixup_aarch64_pcrel_adrhi, except this is used by pc-relative loads and
+ // generates relocations directly when necessary.
fixup_aarch64_ldr_pcrel_imm19,
// FIXME: comment
fixup_aarch64_movw,
- // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
- // immediate.
+ // The high 14 bits of a 21-bit pc-relative immediate.
fixup_aarch64_pcrel_branch14,
- // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
- // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
- // b.cc and generates relocations directly when necessary.
+ // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
+  // fixup_aarch64_pcrel_adrhi, except this is used by b.cc and generates
+ // relocations directly when necessary.
fixup_aarch64_pcrel_branch19,
- // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
- // immediate.
+ // The high 26 bits of a 28-bit pc-relative immediate.
fixup_aarch64_pcrel_branch26,
- // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
- // immediate. Distinguished from branch26 only on ELF.
+ // The high 26 bits of a 28-bit pc-relative immediate. Distinguished from
+ // branch26 only on ELF.
fixup_aarch64_pcrel_call26,
- // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
- // R_AARCH64_TLSDESC_CALL relocation.
+  // Zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation.
fixup_aarch64_tlsdesc_call,
// Marker
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 1b28df963b40..fc808ee0cdd6 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -100,3 +100,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
HasIdentDirective = true;
}
+
+AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
+ CommentString = ";";
+}
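// Presumably ";" is chosen because it is the comment marker Microsoft's
// ARM assembler (armasm) expects, matching the COFF toolchain this
// AsmInfo serves.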
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 253cd30f26ee..2d7107a37244 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
@@ -33,6 +34,10 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
explicit AArch64MCAsmInfoELF(const Triple &T);
};
+struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF {
+ explicit AArch64MCAsmInfoCOFF();
+};
+
} // namespace llvm
#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index f710065d9bc7..a2555496cdb9 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -14,6 +14,7 @@
#include "AArch64MCTargetDesc.h"
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
+#include "AArch64WinCOFFStreamer.h"
#include "InstPrinter/AArch64InstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -59,8 +60,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI;
if (TheTriple.isOSBinFormatMachO())
MAI = new AArch64MCAsmInfoDarwin();
+ else if (TheTriple.isOSBinFormatCOFF())
+ MAI = new AArch64MCAsmInfoCOFF();
else {
- assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+ assert(TheTriple.isOSBinFormatELF() && "Invalid target");
MAI = new AArch64MCAsmInfoELF(TheTriple);
}
@@ -74,8 +77,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
CodeModel::Model &CM) {
- assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
- "Only expect Darwin and ELF targets");
+ assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() ||
+ TT.isOSBinFormatCOFF()) && "Invalid target");
if (CM == CodeModel::Default)
CM = CodeModel::Small;
@@ -122,6 +125,14 @@ static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
/*LabelSections*/ true);
}
+static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+ IncrementalLinkerCompatible);
+}
+
static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
return new MCInstrAnalysis(Info);
}
@@ -154,6 +165,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the obj streamers.
TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer);
+ TargetRegistry::RegisterCOFFStreamer(*T, createWinCOFFStreamer);
// Register the obj target streamer.
TargetRegistry::RegisterObjectTargetStreamer(
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 615d7dab2c51..1404926b8124 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -60,6 +60,8 @@ MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS,
uint32_t CPUType,
uint32_t CPUSubtype);
+MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS);
+
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..7862a03e771c
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -0,0 +1,65 @@
+//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+ AArch64WinCOFFObjectWriter()
+ : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {
+ }
+
+ ~AArch64WinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+
+ bool recordRelocation(const MCFixup &) const override;
+};
+
+} // end anonymous namespace
+
+unsigned
+AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsCrossSection,
+ const MCAsmBackend &MAB) const {
+ const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+ report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+}
+
+bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
+ return true;
+}
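// Note that the writer added here is still a stub: any fixup that survives
// to getRelocType aborts with "unsupported relocation type", so at this
// stage only objects whose fixups all resolve internally can be emitted.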
+
+namespace llvm {
+
+MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) {
+ MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter();
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
new file mode 100644
index 000000000000..6c8da27e398f
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -0,0 +1,37 @@
+//===-- AArch64WinCOFFStreamer.cpp - AArch64 WinCOFF Streamer ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64WinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+
+class AArch64WinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+ friend class AArch64TargetWinCOFFStreamer;
+
+ AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+ raw_pwrite_stream &OS)
+ : MCWinCOFFStreamer(C, AB, CE, OS) {}
+};
+} // end anonymous namespace
+
+namespace llvm {
+MCWinCOFFStreamer
+*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
+}
+
+} // end llvm namespace
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
new file mode 100644
index 000000000000..1b4fcd6804e2
--- /dev/null
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -0,0 +1,43 @@
+//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements WinCOFF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+namespace {
+class AArch64WinCOFFStreamer;
+
+class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
+private:
+ AArch64WinCOFFStreamer &getStreamer();
+
+public:
+ AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
+ : AArch64TargetStreamer(S) {}
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+MCWinCOFFStreamer
+*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible);
+} // end llvm namespace
+
+#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
index 6d8be5e63fbb..56eeba8a1d4b 100644
--- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
@@ -8,6 +8,8 @@ add_llvm_library(LLVMAArch64Desc
AArch64MCTargetDesc.cpp
AArch64MachObjectWriter.cpp
AArch64TargetStreamer.cpp
+ AArch64WinCOFFObjectWriter.cpp
+ AArch64WinCOFFStreamer.cpp
)
add_dependencies(LLVMAArch64Desc AArch64CommonTableGen)
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 55d18c3f3646..5a799b2d88d0 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -36,7 +36,6 @@ FunctionPass *createR600ControlFlowFinalizer();
FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
-FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 7494e5decd6f..f1d899c4d003 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -262,8 +262,8 @@ def FeatureSDWAMac : SubtargetFeature<"sdwa-mav",
"Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
>;
-def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc",
- "HasSDWAClampVOPC",
+def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc",
+ "HasSDWAOutModsVOPC",
"true",
"Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
>;
@@ -452,7 +452,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
- FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP
+ FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP
]
>;
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 2071b6f157cd..9a391d06c9ea 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1776,7 +1776,7 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
E = EndMBB->succ_end();
PI != E; ++PI) {
// Either we have a back-edge to the entry block, or a back-edge to the
- // succesor of the entry block since the block may be split.
+ // successor of the entry block since the block may be split.
if ((*PI) != StartMBB &&
!((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
Succs.insert(
@@ -1831,7 +1831,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
IfBB->addSuccessor(CodeBBStart);
DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
- // Ensure that the MergeBB is a succesor of the CodeEndBB.
+ // Ensure that the MergeBB is a successor of the CodeEndBB.
if (!CodeBBEnd->isSuccessor(MergeBB))
CodeBBEnd->addSuccessor(MergeBB);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index ab5abf2039a5..be47b900c6f0 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -128,7 +128,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasSDWAScalar(false),
HasSDWASdst(false),
HasSDWAMac(false),
- HasSDWAClampVOPC(false),
+ HasSDWAOutModsVOPC(false),
HasDPP(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 2b16289c723e..22cede59086a 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -153,7 +153,7 @@ protected:
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
- bool HasSDWAClampVOPC;
+ bool HasSDWAOutModsVOPC;
bool HasDPP;
bool FlatAddressSpace;
bool FlatInstOffsets;
@@ -452,8 +452,8 @@ public:
return HasSDWAMac;
}
- bool hasSDWAClampVOPC() const {
- return HasSDWAClampVOPC;
+ bool hasSDWAOutModsVOPC() const {
+ return HasSDWAOutModsVOPC;
}
/// \brief Returns the offset in bytes from the start of the input buffer
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 04fe9f689806..425fd35d47de 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -720,7 +720,6 @@ bool GCNPassConfig::addPreISel() {
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
}
addPass(createSinkingPass());
- addPass(createSITypeRewriter());
addPass(createAMDGPUAnnotateUniformValues());
if (!LateCFGStructurize) {
addPass(createSIAnnotateControlFlowPass());
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 88245b01683a..89a03902dc69 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -63,7 +63,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
return false;
}
-void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
+void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
UP.MaxCount = UINT_MAX;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 485e20411ab4..9a320bdfcc3d 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -68,7 +68,8 @@ public:
bool hasBranchDivergence() { return true; }
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index e30844f082cd..917d9cfa6905 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -96,7 +96,6 @@ add_llvm_target(AMDGPUCodeGen
SIPeepholeSDWA.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
- SITypeRewriter.cpp
SIWholeQuadMode.cpp
GCNIterativeScheduler.cpp
GCNMinRegStrategy.cpp
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 04308fb3aaf6..f26e49295e69 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -626,7 +626,9 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
using namespace AMDGPU::SDWA;
if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
- if (SDWA9EncValues::SRC_VGPR_MIN <= Val &&
+ // XXX: static_cast<int> is needed to avoid a spurious "comparison with
+ // unsigned is always true" warning, since SRC_VGPR_MIN is zero.
+ if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) &&
Val <= SDWA9EncValues::SRC_VGPR_MAX) {
return createRegOperand(getVgprClassId(Width),
Val - SDWA9EncValues::SRC_VGPR_MIN);
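
The cast is only needed because SRC_VGPR_MIN is zero, which makes `0 <= Val` tautological for an unsigned Val. A minimal standalone repro of the warning pattern, with the constant inlined for illustration (not the LLVM code itself):

// With MIN == 0, `MIN <= Val` over an unsigned Val is always true; casting
// one side to int is the usual way to silence the diagnostic.
constexpr int SRC_VGPR_MIN = 0;

bool inVgprRange(unsigned Val, unsigned Max) {
  return SRC_VGPR_MIN <= static_cast<int>(Val) && Val <= Max;
}
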
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index d0f4e00994de..d39b345bdf03 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4314,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
return SDValue();
}
+// Returns true if the argument is a boolean value which is not serialized into
+// memory or an argument and does not require v_cndmask_b32 to be deserialized.
+static bool isBoolSGPR(SDValue V) {
+ if (V.getValueType() != MVT::i1)
+ return false;
+ switch (V.getOpcode()) {
+ default: break;
+ case ISD::SETCC:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case AMDGPUISD::FP_CLASS:
+ return true;
+ }
+ return false;
+}
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -4402,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
+ if (VT == MVT::i32 &&
+ (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
+ // and x, (sext cc from i1) => select cc, x, 0
+ if (RHS.getOpcode() != ISD::SIGN_EXTEND)
+ std::swap(LHS, RHS);
+ if (isBoolSGPR(RHS.getOperand(0)))
+ return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
+ LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
+ }
+
return SDValue();
}
@@ -4941,8 +4968,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND: {
auto Cond = RHS.getOperand(0);
- if (Cond.getOpcode() != ISD::SETCC &&
- Cond.getOpcode() != AMDGPUISD::FP_CLASS)
+ if (!isBoolSGPR(Cond))
break;
SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
@@ -5109,6 +5135,35 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT VT = LHS.getValueType();
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ auto CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!CRHS) {
+ CRHS = dyn_cast<ConstantSDNode>(LHS);
+ if (CRHS) {
+ std::swap(LHS, RHS);
+ CC = getSetCCSwappedOperands(CC);
+ }
+ }
+
+ if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // (setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
+ // (setcc (sext from i1 cc), -1, eq|sle|uge) => cc
+ // (setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
+ // (setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
+ return LHS.getOperand(0);
+ }
if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
VT != MVT::f16))
@@ -5116,7 +5171,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
// Match isinf pattern
// (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
if (!CRHS)
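
The new sext-from-i1 folds are easy to sanity-check outside the DAG: the sign extension of an i1 is either 0 or -1, so comparing it against those two constants collapses to the condition or its negation. A standalone C++ check of the basic eq/ne cases from the comment table above (not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  for (int cc = 0; cc <= 1; ++cc) {
    int32_t S = cc ? -1 : 0;        // sext from i1: all-ones or all-zeros
    assert((S != -1) == (cc == 0)); // setcc(S, -1, ne) => not cc
    assert((S == -1) == (cc == 1)); // setcc(S, -1, eq) => cc
    assert((S == 0)  == (cc == 0)); // setcc(S,  0, eq) => not cc
    assert((S != 0)  == (cc == 1)); // setcc(S,  0, ne) => cc
  }
  return 0;
}
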
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index c9b48fea7225..b6784ec14e9f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -770,7 +770,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
}
return;
@@ -871,7 +871,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
}
return;
@@ -2444,8 +2444,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
- if ( DstIdx == -1)
- DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::sdst);
const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
@@ -2488,14 +2486,20 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
return false;
}
- } else if (!ST.hasSDWAClampVOPC()) {
+ } else if (!ST.hasSDWAOutModsVOPC()) {
// No clamp allowed on GFX9 for VOPC
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
- if (Clamp != nullptr &&
- (!Clamp->isImm() || Clamp->getImm() != 0)) {
+ if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
return false;
}
+
+ // No omod allowed on GFX9 for VOPC
+ const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
+ ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
+ return false;
+ }
}
}
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 3b4a8b5d1e81..4a81fb3b463a 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -336,6 +336,10 @@ def NegSubInlineConst16 : ImmLeaf<i16, [{
return Imm < -16 && Imm >= -64;
}], NegateImm>;
+def ShiftAmt32Imm : PatLeaf <(imm), [{
+ return N->getZExtValue() < 32;
+}]>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 3b4bdc864253..bcc685015cf5 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -929,6 +929,14 @@ def : UMad24Pat<V_MAD_U32_U24>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
+def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+
+def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
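
Both new patterns rely on the same identity: for a shift amount below 32, the low 32 bits of a 64-bit right shift are exactly what V_ALIGNBIT_B32 computes from the value's two halves. A standalone C++ check, where alignbit models the instruction's ({hi,lo} >> (s & 31)) & 0xffffffff semantics (a sketch, not LLVM code):

#include <cassert>
#include <cstdint>

// Software model of v_alignbit_b32: select 32 bits starting at bit s of
// the 64-bit concatenation {hi, lo}.
static uint32_t alignbit(uint32_t hi, uint32_t lo, uint32_t s) {
  return (uint32_t)((((uint64_t)hi << 32) | lo) >> (s & 31));
}

int main() {
  uint64_t x = 0x123456789abcdef0ULL;
  for (uint32_t s = 0; s < 32; ++s)
    assert((uint32_t)(x >> s) ==
           alignbit((uint32_t)(x >> 32), (uint32_t)x, s));
  return 0;
}
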
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 4ac23ef03cb3..e2ac6631d2f3 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -627,10 +627,13 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
return false;
}
- if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ if (!ST.hasSDWAOutModsVOPC() &&
+ (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
return false;
- } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
+ !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
return false;
}
@@ -649,25 +652,24 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
assert(SDWAOpcode != -1);
- // Copy dst, if it is present in original then should also be present in SDWA
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (!Dst && !TII->isVOPC(MI))
- return false;
-
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
// Create SDWA version of instruction MI and initialize its operands
MachineInstrBuilder SDWAInst =
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+ // Copy dst: if it is present in the original instruction, it should also be present in the SDWA one.
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (Dst) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
SDWAInst.add(*Dst);
- } else {
- Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
assert(Dst &&
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
SDWAInst.add(*Dst);
+ } else {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+ SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
}
// Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
@@ -714,20 +716,22 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
}
// Copy omod if present, initialize otherwise if needed
- MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
- if (OMod) {
- assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
- SDWAInst.add(*OMod);
- } else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
- SDWAInst.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
+ MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod) {
+ SDWAInst.add(*OMod);
+ } else {
+ SDWAInst.addImm(0);
+ }
}
- // Initialize dst_sel and dst_unused if present
- if (Dst) {
- assert(
- AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
- AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+ // Initialize dst_sel if present
+ if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
+
+ // Initialize dst_unused if present
+ if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
}
diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp
deleted file mode 100644
index aad68537f779..000000000000
--- a/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass performs the following type substitution on all
-/// non-compute shaders:
-///
-/// v16i8 => i128
-/// - v16i8 is used for constant memory resource descriptors. This type is
-/// legal for some compute APIs, and we don't want to declare it as legal
-/// in the backend, because we want the legalizer to expand all v16i8
-/// operations.
-/// v1* => *
-/// - Having v1* types complicates the legalizer and we can easily replace
-/// them with the element type.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-
-namespace {
-
-class SITypeRewriter : public FunctionPass,
- public InstVisitor<SITypeRewriter> {
-
- static char ID;
- Module *Mod;
- Type *v16i8;
- Type *v4i32;
-
-public:
- SITypeRewriter() : FunctionPass(ID) { }
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
- StringRef getPassName() const override { return "SI Type Rewriter"; }
- void visitLoadInst(LoadInst &I);
- void visitCallInst(CallInst &I);
- void visitBitCast(BitCastInst &I);
-};
-
-} // End anonymous namespace
-
-char SITypeRewriter::ID = 0;
-
-bool SITypeRewriter::doInitialization(Module &M) {
- Mod = &M;
- v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
- v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
- return false;
-}
-
-bool SITypeRewriter::runOnFunction(Function &F) {
- if (!AMDGPU::isShader(F.getCallingConv()))
- return false;
-
- visit(F);
- visit(F);
-
- return false;
-}
-
-void SITypeRewriter::visitLoadInst(LoadInst &I) {
- Value *Ptr = I.getPointerOperand();
- Type *PtrTy = Ptr->getType();
- Type *ElemTy = PtrTy->getPointerElementType();
- IRBuilder<> Builder(&I);
- if (ElemTy == v16i8) {
- Value *BitCast = Builder.CreateBitCast(Ptr,
- PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
- LoadInst *Load = Builder.CreateLoad(BitCast);
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- I.getAllMetadataOtherThanDebugLoc(MD);
- for (unsigned i = 0, e = MD.size(); i != e; ++i) {
- Load->setMetadata(MD[i].first, MD[i].second);
- }
- Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
- I.replaceAllUsesWith(BitCastLoad);
- I.eraseFromParent();
- }
-}
-
-void SITypeRewriter::visitCallInst(CallInst &I) {
- IRBuilder<> Builder(&I);
-
- SmallVector <Value*, 8> Args;
- SmallVector <Type*, 8> Types;
- bool NeedToReplace = false;
- Function *F = I.getCalledFunction();
- if (!F)
- return;
-
- std::string Name = F->getName();
- for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
- Value *Arg = I.getArgOperand(i);
- if (Arg->getType() == v16i8) {
- Args.push_back(Builder.CreateBitCast(Arg, v4i32));
- Types.push_back(v4i32);
- NeedToReplace = true;
- Name = Name + ".v4i32";
- } else if (Arg->getType()->isVectorTy() &&
- Arg->getType()->getVectorNumElements() == 1 &&
- Arg->getType()->getVectorElementType() ==
- Type::getInt32Ty(I.getContext())){
- Type *ElementTy = Arg->getType()->getVectorElementType();
- std::string TypeName = "i32";
- InsertElementInst *Def = cast<InsertElementInst>(Arg);
- Args.push_back(Def->getOperand(1));
- Types.push_back(ElementTy);
- std::string VecTypeName = "v1" + TypeName;
- Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
- NeedToReplace = true;
- } else {
- Args.push_back(Arg);
- Types.push_back(Arg->getType());
- }
- }
-
- if (!NeedToReplace) {
- return;
- }
- Function *NewF = Mod->getFunction(Name);
- if (!NewF) {
- NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
- NewF->setAttributes(F->getAttributes());
- }
- I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
- I.eraseFromParent();
-}
-
-void SITypeRewriter::visitBitCast(BitCastInst &I) {
- IRBuilder<> Builder(&I);
- if (I.getDestTy() != v4i32) {
- return;
- }
-
- if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
- if (Op->getSrcTy() == v4i32) {
- I.replaceAllUsesWith(Op->getOperand(0));
- I.eraseFromParent();
- }
- }
-}
-
-FunctionPass *llvm::createSITypeRewriter() {
- return new SITypeRewriter();
-}
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 6f67183df6a1..c40b4450a5b5 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
"Has return address stack">;
+// Some processors have no branch predictor, which changes the expected cost
+// of taking a branch and hence the choice of whether to use predicated
+// instructions.
+def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
+ "HasBranchPredictor", "false",
+ "Has no branch predictor">;
+
/// DSP extension.
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
"Supports DSP instructions in ARM and/or Thumb2">;
@@ -262,6 +269,10 @@ def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
"Generate calls via indirect call "
"instructions">;
+def FeatureExecuteOnly
+ : SubtargetFeature<"execute-only", "GenExecuteOnly", "true",
+ "Enable the generation of execute only code.">;
+
def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
"Reserve R9, making it unavailable as "
"GPR">;
@@ -540,7 +551,7 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
//
// Dummy CPU, used to target architectures
-def : ProcNoItin<"generic", []>;
+def : ProcessorModel<"generic", CortexA8Model, []>;
def : ProcNoItin<"arm8", [ARMv4]>;
def : ProcNoItin<"arm810", [ARMv4]>;
@@ -756,13 +767,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
-def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
-def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
+def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
+
+def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
-def : ProcNoItin<"cortex-m4", [ARMv7em,
+def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,
FeatureVFPOnlySP,
- FeatureD16]>;
+ FeatureD16,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
@@ -771,11 +788,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;
-def : ProcNoItin<"cortex-m33", [ARMv8mMainline,
+def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
FeatureDSP,
FeatureFPARMv8,
FeatureD16,
- FeatureVFPOnlySP]>;
+ FeatureVFPOnlySP,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index e0810c358f2d..1ec6b24b2ed6 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
}
bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &,
+isProfitableToIfCvt(MachineBasicBlock &TBB,
unsigned TCycles, unsigned TExtra,
- MachineBasicBlock &,
+ MachineBasicBlock &FBB,
unsigned FCycles, unsigned FExtra,
BranchProbability Probability) const {
if (!TCycles)
@@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
// Here we scale up each component of UnpredCost to avoid precision issues
// when scaling TCycles/FCycles by Probability.
const unsigned ScalingUpFactor = 1024;
- unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
- unsigned FUnpredCost =
+
+ unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+ unsigned UnpredCost;
+ if (!Subtarget.hasBranchPredictor()) {
+ // When we don't have a branch predictor it's always cheaper to not take a
+ // branch than take it, so we have to take that into account.
+ unsigned NotTakenBranchCost = 1;
+ unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
+ unsigned TUnpredCycles, FUnpredCycles;
+ if (!FCycles) {
+ // Triangle: TBB is the fallthrough
+ TUnpredCycles = TCycles + NotTakenBranchCost;
+ FUnpredCycles = TakenBranchCost;
+ } else {
+ // Diamond: TBB is the block that is branched to, FBB is the fallthrough
+ TUnpredCycles = TCycles + TakenBranchCost;
+ FUnpredCycles = FCycles + NotTakenBranchCost;
+ }
+ // The total cost is the cost of each path scaled by their probabilities
+ unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
+ unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
+ UnpredCost = TUnpredCost + FUnpredCost;
+ // When predicating, assume that the first IT can be folded away but later
+ // ones cost one cycle each.
+ if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
+ PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
+ }
+ } else {
+ unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+ unsigned FUnpredCost =
Probability.getCompl().scale(FCycles * ScalingUpFactor);
- unsigned UnpredCost = TUnpredCost + FUnpredCost;
- UnpredCost += 1 * ScalingUpFactor; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ UnpredCost = TUnpredCost + FUnpredCost;
+ UnpredCost += 1 * ScalingUpFactor; // The branch itself
+ UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ }
- return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+ return PredCost <= UnpredCost;
}
bool
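
The structure of the new comparison is easier to see without the MachineBasicBlock plumbing. A simplified standalone sketch, using plain doubles in place of BranchProbability and omitting the Thumb2 IT adjustment (the cycle counts in main are invented):

#include <cstdio>

struct Costs { unsigned TCycles, FCycles, TExtra, FExtra; };

// Decide whether if-conversion (predication) beats keeping the branch.
static bool profitableToIfCvt(const Costs &C, double ProbTaken,
                              bool HasBranchPredictor,
                              unsigned MispredictPenalty) {
  unsigned PredCost = C.TCycles + C.FCycles + C.TExtra + C.FExtra;
  double UnpredCost;
  if (!HasBranchPredictor) {
    // Without prediction every taken branch pays the full penalty and a
    // not-taken branch costs a single cycle.
    const unsigned NotTaken = 1, Taken = MispredictPenalty;
    double TUnpred, FUnpred;
    if (C.FCycles == 0) { // Triangle: the true block is the fallthrough.
      TUnpred = C.TCycles + NotTaken;
      FUnpred = Taken;
    } else {              // Diamond: the true block is branched to.
      TUnpred = C.TCycles + Taken;
      FUnpred = C.FCycles + NotTaken;
    }
    UnpredCost = ProbTaken * TUnpred + (1.0 - ProbTaken) * FUnpred;
  } else {
    UnpredCost = ProbTaken * C.TCycles + (1.0 - ProbTaken) * C.FCycles;
    UnpredCost += 1.0;                      // The branch itself.
    UnpredCost += MispredictPenalty / 10.0; // Amortized misprediction cost.
  }
  return PredCost <= UnpredCost;
}

int main() {
  Costs Triangle{2, 0, 0, 0};
  std::printf("predictor-less triangle profitable: %d\n",
              profitableToIfCvt(Triangle, 0.5, false, 2));
  return 0;
}
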
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2bcc707e9fc3..e42514acd76f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -7580,6 +7580,9 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDValue VHi = DAG.getAnyExtOrTrunc(
DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
dl, MVT::i32);
+ bool isBigEndian = DAG.getDataLayout().isBigEndian();
+ if (isBigEndian)
+ std::swap(VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
@@ -7607,10 +7610,14 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
- Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
- SDValue(CmpSwap, 0)));
- Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
- SDValue(CmpSwap, 0)));
+ bool isBigEndian = DAG.getDataLayout().isBigEndian();
+
+ Results.push_back(
+ DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
+ Results.push_back(
+ DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
Results.push_back(SDValue(CmpSwap, 2));
}
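
The reason for both swaps is the lane order of an i64 in a GPRPair: gsub_0 holds whichever 32-bit half comes first, and on big-endian targets that is the high word. A standalone sketch of the invariant (not LLVM code):

#include <cassert>
#include <cstdint>

// Split an i64 into the two lanes of a register pair; gsub_0 gets the half
// that comes first for the target's endianness.
static void splitI64(uint64_t V, bool BigEndian, uint32_t Pair[2]) {
  const uint32_t Lo = (uint32_t)V, Hi = (uint32_t)(V >> 32);
  Pair[0] = BigEndian ? Hi : Lo; // gsub_0
  Pair[1] = BigEndian ? Lo : Hi; // gsub_1
}

int main() {
  uint32_t P[2];
  splitI64(0x1122334455667788ULL, /*BigEndian=*/true, P);
  assert(P[0] == 0x11223344u && P[1] == 0x55667788u);
  splitI64(0x1122334455667788ULL, /*BigEndian=*/false, P);
  assert(P[0] == 0x55667788u && P[1] == 0x11223344u);
  return 0;
}
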
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 423f97ccacd6..891a8f482f0a 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1416,12 +1416,12 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
isIndirectBranch = 1 in {
def tTBB_JT : tPseudoInst<(outs),
- (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
- Sched<[WriteBr]>;
+ (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
+ IIC_Br, []>, Sched<[WriteBr]>;
def tTBH_JT : tPseudoInst<(outs),
- (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
- Sched<[WriteBr]>;
+ (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
+ IIC_Br, []>, Sched<[WriteBr]>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 4cb0eca5ee5f..374176d1d737 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -46,6 +46,10 @@ private:
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) const;
+ bool selectSelect(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) const;
+
const ARMBaseInstrInfo &TII;
const ARMBaseRegisterInfo &TRI;
const ARMBaseTargetMachine &TM;
@@ -346,6 +350,50 @@ bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB,
return true;
}
+bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
+ const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) const {
+ auto &MBB = *MIB->getParent();
+ auto InsertBefore = std::next(MIB->getIterator());
+ auto &DebugLoc = MIB->getDebugLoc();
+
+ // Compare the condition to 0.
+ auto CondReg = MIB->getOperand(1).getReg();
+ assert(MRI.getType(CondReg).getSizeInBits() == 1 &&
+ RBI.getRegBank(CondReg, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported types for select operation");
+ auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPri))
+ .addUse(CondReg)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI))
+ return false;
+
+ // Move a value into the result register based on the result of the
+ // comparison.
+ auto ResReg = MIB->getOperand(0).getReg();
+ auto TrueReg = MIB->getOperand(2).getReg();
+ auto FalseReg = MIB->getOperand(3).getReg();
+ assert(MRI.getType(ResReg) == MRI.getType(TrueReg) &&
+ MRI.getType(TrueReg) == MRI.getType(FalseReg) &&
+ MRI.getType(FalseReg).getSizeInBits() == 32 &&
+ RBI.getRegBank(TrueReg, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ RBI.getRegBank(FalseReg, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported types for select operation");
+ auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCr))
+ .addDef(ResReg)
+ .addUse(TrueReg)
+ .addUse(FalseReg)
+ .add(predOps(ARMCC::EQ, ARM::CPSR));
+ if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI))
+ return false;
+
+ MIB->eraseFromParent();
+ return true;
+}
+
bool ARMInstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -448,6 +496,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
}
case G_ICMP:
return selectICmp(MIB, TII, MRI, TRI, RBI);
+ case G_SELECT:
+ return selectSelect(MIB, TII, MRI, TRI, RBI);
case G_GEP:
I.setDesc(TII.get(ARM::ADDrr));
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index 5873c7fb3872..f3e62d09cc30 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -55,10 +55,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
for (unsigned Op : {G_SDIV, G_UDIV}) {
for (auto Ty : {s8, s16})
- // FIXME: We need WidenScalar here, but in the case of targets with
- // software division we'll also need Libcall afterwards. Treat as Custom
- // until we have better support for chaining legalization actions.
- setAction({Op, Ty}, Custom);
+ setAction({Op, Ty}, WidenScalar);
if (ST.hasDivideInARMMode())
setAction({Op, s32}, Legal);
else
@@ -84,6 +81,10 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s32}, Legal);
+ setAction({G_SELECT, s32}, Legal);
+ setAction({G_SELECT, p0}, Legal);
+ setAction({G_SELECT, 1, s1}, Legal);
+
setAction({G_CONSTANT, s32}, Legal);
setAction({G_ICMP, s1}, Legal);
@@ -118,40 +119,6 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
switch (MI.getOpcode()) {
default:
return false;
- case G_SDIV:
- case G_UDIV: {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- if (Ty != LLT::scalar(16) && Ty != LLT::scalar(8))
- return false;
-
- // We need to widen to 32 bits and then maybe, if the target requires,
- // transform into a libcall.
- LegalizerHelper Helper(MIRBuilder.getMF());
-
- MachineInstr *NewMI = nullptr;
- Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) {
- // Store the new, 32-bit div instruction.
- if (MI->getOpcode() == G_SDIV || MI->getOpcode() == G_UDIV)
- NewMI = MI;
- });
-
- auto Result = Helper.widenScalar(MI, 0, LLT::scalar(32));
- Helper.MIRBuilder.stopRecordingInsertions();
- if (Result == LegalizerHelper::UnableToLegalize) {
- return false;
- }
- assert(NewMI && "Couldn't find widened instruction");
- assert((NewMI->getOpcode() == G_SDIV || NewMI->getOpcode() == G_UDIV) &&
- "Unexpected widened instruction");
- assert(MRI.getType(NewMI->getOperand(0).getReg()).getSizeInBits() == 32 &&
- "Unexpected type for the widened instruction");
-
- Result = Helper.legalizeInstrStep(*NewMI);
- if (Result == LegalizerHelper::UnableToLegalize) {
- return false;
- }
- return true;
- }
case G_SREM:
case G_UREM: {
unsigned OriginalResult = MI.getOperand(0).getReg();
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 2350d0c6ef69..11fb81a4f9fe 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -255,6 +255,18 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OperandsMapping =
getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
break;
+ case G_SELECT: {
+ LLT Ty2 = MRI.getType(MI.getOperand(1).getReg());
+ (void)Ty2;
+ assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT");
+ assert(Ty2.getSizeInBits() == 1 && "Unsupported size for G_SELECT");
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
case G_ICMP: {
LLT Ty2 = MRI.getType(MI.getOperand(2).getReg());
(void)Ty2;
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 02cbfb1fa9f1..b10583bc7983 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -245,6 +245,10 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
// the general GPR register class above (MOV, e.g.)
def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+// Thumb registers R0-R7 and the PC. Some instructions like TBB or TBH allow
+// the PC to be used as a destination operand as well.
+def tGPRwithpc : RegisterClass<"ARM", [i32], 32, (add tGPR, PC)>;
+
// The high registers in thumb mode, R8-R15.
def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 1c7902520f2d..53e012f13ee2 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
include "ARMScheduleA57.td"
+include "ARMScheduleM3.td"
diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td
new file mode 100644
index 000000000000..93f8299f9bd0
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleM3.td
@@ -0,0 +1,21 @@
+//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-M3 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM3Model : SchedMachineModel {
+ let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
+ let MicroOpBufferSize = 0; // In-order
+ let LoadLatency = 2; // Latency when not pipelined, not pc-relative
+ let MispredictPenalty = 2; // Best case branch taken cost
+
+ let CompleteModel = 0;
+}
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index d9d0c27c6304..2c42a1336166 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -11,6 +11,13 @@
//
//===----------------------------------------------------------------------===//
+#include "ARM.h"
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "ARMCallLowering.h"
+#include "ARMLegalizerInfo.h"
+#include "ARMRegisterBankInfo.h"
+#endif
#include "ARMSubtarget.h"
#include "ARMFrameLowering.h"
#include "ARMInstrInfo.h"
@@ -23,6 +30,13 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#endif
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
@@ -78,11 +92,6 @@ ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
-/// EnableExecuteOnly - Enables the generation of execute-only code on supported
-/// targets
-static cl::opt<bool>
-EnableExecuteOnly("arm-execute-only");
-
ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
StringRef FS) {
ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS);
@@ -92,13 +101,41 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
return new ARMFrameLowering(STI);
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct ARMGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const ARMBaseTargetMachine &TM, bool IsLittle)
: ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
- GenExecuteOnly(EnableExecuteOnly), CPUString(CPU), IsLittle(IsLittle),
- TargetTriple(TT), Options(TM.Options), TM(TM),
- FrameLowering(initializeFrameLowering(CPU, FS)),
+ CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options),
+ TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
InstrInfo(isThumb1Only()
@@ -106,7 +143,29 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
: !isThumb()
? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
: (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
- TLInfo(TM, *this) {}
+ TLInfo(TM, *this) {
+ assert((isThumb() || hasARMOps()) &&
+ "Target must either be thumb or support ARM operations!");
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(new ARMCallLowering(*getTargetLowering()));
+ GISel->Legalizer.reset(new ARMLegalizerInfo(*this));
+
+ auto *RBI = new ARMRegisterBankInfo(*getRegisterInfo());
+
+ // FIXME: At this point, we can't rely on Subtarget having RBI.
+ // It's awkward to mix passing RBI and the Subtarget; should we pass
+ // TII/TRI as well?
+ GISel->InstSelector.reset(createARMInstructionSelector(
+ *static_cast<const ARMBaseTargetMachine *>(&TM), *this, *RBI));
+
+ GISel->RegBankInfo.reset(RBI);
+#endif
+ setGISelAccessor(*GISel);
+}
const CallLowering *ARMSubtarget::getCallLowering() const {
assert(GISel && "Access to GlobalISel APIs not set");
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index d890d0fa777e..e15b17512c96 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -246,6 +246,11 @@ protected:
/// avoid issue "normal" call instructions to callees which do not return.
bool HasRetAddrStack = false;
+ /// HasBranchPredictor - True if the subtarget has a branch predictor. Having
+ /// a branch predictor or not changes the expected cost of taking a branch
+ /// which affects the choice of whether to use predicated instructions.
+ bool HasBranchPredictor = true;
+
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
bool HasMPExtension = false;
@@ -554,6 +559,7 @@ public:
bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
+ bool hasBranchPredictor() const { return HasBranchPredictor; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index eb71e557ec91..c323a1d368de 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -11,11 +11,6 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
-#include "ARMCallLowering.h"
-#include "ARMLegalizerInfo.h"
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-#include "ARMRegisterBankInfo.h"
-#endif
#include "ARMSubtarget.h"
#include "ARMMacroFusion.h"
#include "ARMTargetMachine.h"
@@ -29,7 +24,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ExecutionDepsFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
@@ -110,60 +104,20 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
static ARMBaseTargetMachine::ARMABI
computeTargetABI(const Triple &TT, StringRef CPU,
const TargetOptions &Options) {
- if (Options.MCOptions.getABIName() == "aapcs16")
+ StringRef ABIName = Options.MCOptions.getABIName();
+
+ if (ABIName.empty())
+ ABIName = ARM::computeDefaultTargetABI(TT, CPU);
+
+ if (ABIName == "aapcs16")
return ARMBaseTargetMachine::ARM_ABI_AAPCS16;
- else if (Options.MCOptions.getABIName().startswith("aapcs"))
+ else if (ABIName.startswith("aapcs"))
return ARMBaseTargetMachine::ARM_ABI_AAPCS;
- else if (Options.MCOptions.getABIName().startswith("apcs"))
+ else if (ABIName.startswith("apcs"))
return ARMBaseTargetMachine::ARM_ABI_APCS;
- assert(Options.MCOptions.getABIName().empty() &&
- "Unknown target-abi option!");
-
- ARMBaseTargetMachine::ARMABI TargetABI =
- ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
-
- unsigned ArchKind = ARM::parseCPUArch(CPU);
- StringRef ArchName = ARM::getArchName(ArchKind);
- // FIXME: This is duplicated code from the front end and should be unified.
- if (TT.isOSBinFormatMachO()) {
- if (TT.getEnvironment() == Triple::EABI ||
- (TT.getOS() == Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
- ARM::parseArchProfile(ArchName) == ARM::PK_M) {
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- } else if (TT.isWatchABI()) {
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
- } else {
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
- }
- } else if (TT.isOSWindows()) {
- // FIXME: this is invalid for WindowsCE
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- } else {
- // Select the default based on the platform.
- switch (TT.getEnvironment()) {
- case Triple::Android:
- case Triple::GNUEABI:
- case Triple::GNUEABIHF:
- case Triple::MuslEABI:
- case Triple::MuslEABIHF:
- case Triple::EABIHF:
- case Triple::EABI:
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- break;
- case Triple::GNU:
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
- break;
- default:
- if (TT.isOSNetBSD())
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
- else
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- break;
- }
- }
-
- return TargetABI;
+ llvm_unreachable("Unhandled/unknown ABI Name!");
+ return ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
}
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
@@ -248,61 +202,39 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM,
OL),
TargetABI(computeTargetABI(TT, CPU, Options)),
- TLOF(createTLOF(getTargetTriple())),
- Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
+ TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) {
// Default to triple-appropriate float ABI
- if (Options.FloatABIType == FloatABI::Default)
- this->Options.FloatABIType =
- Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
+ if (Options.FloatABIType == FloatABI::Default) {
+ if (TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ TargetTriple.isOSWindows() ||
+ TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+ this->Options.FloatABIType = FloatABI::Hard;
+ else
+ this->Options.FloatABIType = FloatABI::Soft;
+ }
// Default to triple-appropriate EABI
if (Options.EABIVersion == EABI::Default ||
Options.EABIVersion == EABI::Unknown) {
// musl is compatible with glibc with regard to EABI version
- if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI())
+ if ((TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF) &&
+ !(TargetTriple.isOSWindows() || TargetTriple.isOSDarwin()))
this->Options.EABIVersion = EABI::GNU;
else
this->Options.EABIVersion = EABI::EABI5;
}
initAsmInfo();
- if (!Subtarget.isThumb() && !Subtarget.hasARMOps())
- report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
- "support ARM mode execution!");
}
ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct ARMGISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
const ARMSubtarget *
ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -334,24 +266,6 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle);
-
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
- GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new ARMLegalizerInfo(*I));
-
- auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo());
-
- // FIXME: At this point, we can't rely on Subtarget having RBI.
- // It's awkward to mix passing RBI and the Subtarget; should we pass
- // TII/TRI as well?
- GISel->InstSelector.reset(createARMInstructionSelector(*this, *I, *RBI));
-
- GISel->RegBankInfo.reset(RBI);
-#endif
- I->setGISelAccessor(*GISel);
}
return I.get();
}
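
The removed default-ABI logic is assumed to survive behind ARM::computeDefaultTargetABI. A rough standalone sketch of the policy it implemented, keyed on OS and environment strings instead of a Triple (the names below are illustrative, not the TargetParser API):

#include <string>

enum class ARMABI { APCS, AAPCS, AAPCS16 };

// Approximate default-ABI selection; the real logic also inspects the
// architecture profile for Mach-O targets.
ARMABI defaultABI(const std::string &OS, const std::string &Env) {
  if (OS == "watchos")
    return ARMABI::AAPCS16;
  if (OS == "windows")
    return ARMABI::AAPCS; // FIXME upstream: invalid for WindowsCE
  if (Env == "gnueabi" || Env == "gnueabihf" || Env == "musleabi" ||
      Env == "musleabihf" || Env == "eabi" || Env == "eabihf" ||
      Env == "android")
    return ARMABI::AAPCS;
  if (Env == "gnu" || OS == "netbsd")
    return ARMABI::APCS;
  return ARMABI::AAPCS;
}
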
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 2fcee73228fe..f41da3e8e223 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -36,7 +36,6 @@ public:
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- ARMSubtarget Subtarget;
bool isLittle;
mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap;
@@ -47,8 +46,8 @@ public:
CodeGenOpt::Level OL, bool isLittle);
~ARMBaseTargetMachine() override;
- const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
+ const ARMSubtarget *getSubtargetImpl() const = delete;
bool isLittleEndian() const { return isLittle; }
/// \brief Get the TargetIRAnalysis for this target.
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index a5b27abeb27f..88bab64ffaf2 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -32,7 +32,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
- genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(isAAPCS_ABI);
@@ -43,16 +43,6 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
AttributesSection =
getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
-
- // Make code section unreadable when in execute-only mode
- if (genExecuteOnly) {
- unsigned Type = ELF::SHT_PROGBITS;
- unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE;
- // Since we cannot modify flags for an existing section, we create a new
- // section with the right flags, and use 0 as the unique ID for
- // execute-only text
- TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
- }
}
const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
@@ -74,21 +64,27 @@ getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
getContext());
}
-MCSection *
-ARMElfTargetObjectFile::getExplicitSectionGlobal(const GlobalObject *GO,
- SectionKind SK, const TargetMachine &TM) const {
+static bool isExecuteOnlyFunction(const GlobalObject *GO, SectionKind SK,
+ const TargetMachine &TM) {
+ if (const Function *F = dyn_cast<Function>(GO))
+ if (TM.getSubtarget<ARMSubtarget>(*F).genExecuteOnly() && SK.isText())
+ return true;
+ return false;
+}
+
+MCSection *ARMElfTargetObjectFile::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
// Set execute-only access for the explicit section
- if (genExecuteOnly && SK.isText())
+ if (isExecuteOnlyFunction(GO, SK, TM))
SK = SectionKind::getExecuteOnly();
return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
}
-MCSection *
-ARMElfTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
- SectionKind SK, const TargetMachine &TM) const {
+MCSection *ARMElfTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
// Place the global in the execute-only text section
- if (genExecuteOnly && SK.isText())
+ if (isExecuteOnlyFunction(GO, SK, TM))
SK = SectionKind::getExecuteOnly();
return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, SK, TM);
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index dbb8128269dc..bd7aa1cfe02b 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -16,8 +16,6 @@
namespace llvm {
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
- mutable bool genExecuteOnly = false;
-
protected:
const MCSection *AttributesSection = nullptr;
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 585726208a8d..5ab236b7fd4c 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -486,7 +486,7 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- Size = 0;
+ Size = 4;
return MCDisassembler::Fail;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 81760f03940a..22de728fe06e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -738,13 +738,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
}
}
-void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCFixup &Fixup,
- const MCValue &Target, bool &IsResolved) {
+bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
const unsigned FixupKind = Fixup.getKind();
- if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+ if ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
assert(Sym && "How did we resolve this?");
// If the symbol is external the linker will handle it.
@@ -753,7 +753,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
// If the symbol is out of range, produce a relocation and hope the
// linker can handle it. GNU AS produces an error in this case.
if (Sym->isExternal())
- IsResolved = false;
+ return true;
}
// Create relocations for unconditional branches to function symbols with
// different execution mode in ELF binaries.
@@ -761,12 +761,12 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType();
if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
- IsResolved = false;
+ return true;
if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br ||
FixupKind == ARM::fixup_arm_thumb_bl ||
FixupKind == ARM::fixup_t2_condbranch ||
FixupKind == ARM::fixup_t2_uncondbranch))
- IsResolved = false;
+ return true;
}
}
// We must always generate a relocation for BL/BLX instructions if we have
@@ -776,7 +776,8 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
FixupKind == ARM::fixup_arm_blx ||
FixupKind == ARM::fixup_arm_uncondbl ||
FixupKind == ARM::fixup_arm_condbl))
- IsResolved = false;
+ return true;
+ return false;
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
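This hunk tracks the MC-layer interface migration from processFixupValue to shouldForceRelocation: instead of mutating an IsResolved flag by reference, the backend now answers a side-effect-free predicate. A condensed sketch of the equivalence (simplified from the code above):

    // Old: void processFixupValue(..., bool &IsResolved) {
    //        if (MustForce) IsResolved = false;
    //      }
    // New: a pure predicate; returning true always records a relocation.
    bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
                               const MCValue &Target) {
      bool MustForce = false; // same conditions as the old hook
      return MustForce;
    }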
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 6a0ba2ed41c1..84b54bbb9a49 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -38,10 +38,8 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
- /// processFixupValue - Target hook to process the literal value of a fixup
- /// if necessary.
- void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, bool &IsResolved) override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, uint64_t Value, bool IsPCRel,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 9f6c5d7bf920..831589ba0581 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -15,55 +15,47 @@
namespace llvm {
namespace ARM {
enum Fixups {
- // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
- // addresses
+ // 12-bit PC relative relocation for symbol addresses
fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
- // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
- // the 16-bit halfwords reordered.
+ // Equivalent to fixup_arm_ldst_pcrel_12, with the 16-bit halfwords reordered.
fixup_t2_ldst_pcrel_12,
- // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol
- // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
+ // 10-bit PC relative relocation for symbol addresses used in
+ // LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
fixup_arm_pcrel_10_unscaled,
- // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
- // used in VFP instructions where the lower 2 bits are not encoded
- // (so it's encoded as an 8-bit immediate).
+ // 10-bit PC relative relocation for symbol addresses used in VFP instructions
+ // where the lower 2 bits are not encoded (so it's encoded as an 8-bit
+ // immediate).
fixup_arm_pcrel_10,
- // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
- // the short-swapped encoding of Thumb2 instructions.
+ // Equivalent to fixup_arm_pcrel_10, accounting for the short-swapped encoding
+ // of Thumb2 instructions.
fixup_t2_pcrel_10,
- // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses
- // used in VFP instructions where bit 0 not encoded (so it's encoded as an
- // 8-bit immediate).
+ // 9-bit PC relative relocation for symbol addresses used in VFP instructions
+ // where bit 0 is not encoded (so it's encoded as an 8-bit immediate).
fixup_arm_pcrel_9,
- // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for
- // the short-swapped encoding of Thumb2 instructions.
+ // Equivalent to fixup_arm_pcrel_9, accounting for the short-swapped encoding
+ // of Thumb2 instructions.
fixup_t2_pcrel_9,
- // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
- // addresses where the lower 2 bits are not encoded (so it's encoded as an
- // 8-bit immediate).
+ // 10-bit PC relative relocation for symbol addresses where the lower 2 bits
+ // are not encoded (so it's encoded as an 8-bit immediate).
fixup_thumb_adr_pcrel_10,
- // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
- // instruction.
+ // 12-bit PC relative relocation for the ADR instruction.
fixup_arm_adr_pcrel_12,
- // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
- // instruction.
+ // 12-bit PC relative relocation for the ADR instruction.
fixup_t2_adr_pcrel_12,
- // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
- // instructions.
+ // 24-bit PC relative relocation for conditional branch instructions.
fixup_arm_condbranch,
- // fixup_arm_uncondbranch - 24-bit PC relative relocation for
- // branch instructions. (unconditional)
+ // 24-bit PC relative relocation for unconditional branch instructions.
fixup_arm_uncondbranch,
- // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
- // uconditional branch instructions.
+ // 20-bit PC relative relocation for Thumb2 direct conditional branch
+ // instructions.
fixup_t2_condbranch,
- // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
- // branch unconditional branch instructions.
+ // 20-bit PC relative relocation for Thumb2 direct unconditional branch
+ // instructions.
fixup_t2_uncondbranch,
- // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+ // 12-bit fixup for Thumb B instructions.
fixup_arm_thumb_br,
// The following fixups handle the ARM BL instructions. These can be
@@ -75,42 +67,41 @@ enum Fixups {
// MachO does not draw a distinction between the two cases, so it will treat
// fixup_arm_uncondbl and fixup_arm_condbl as identical fixups.
- // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+ // Fixup for unconditional ARM BL instructions.
fixup_arm_uncondbl,
- // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial
- // conditionalisation.
+ // Fixup for ARM BL instructions with nontrivial conditionalisation.
fixup_arm_condbl,
- // fixup_arm_blx - Fixup for ARM BLX instructions.
+ // Fixup for ARM BLX instructions.
fixup_arm_blx,
- // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+ // Fixup for Thumb BL instructions.
fixup_arm_thumb_bl,
- // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+ // Fixup for Thumb BLX instructions.
fixup_arm_thumb_blx,
- // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+ // Fixup for Thumb branch instructions.
fixup_arm_thumb_cb,
- // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+ // Fixup for Thumb load/store from constant pool instrs.
fixup_arm_thumb_cp,
- // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+ // Fixup for Thumb conditional branching instructions.
fixup_arm_thumb_bcc,
// The next two are for the movt/movw pair
// the 16bit imm field are split into imm{15-12} and imm{11-0}
fixup_arm_movt_hi16, // :upper16:
fixup_arm_movw_lo16, // :lower16:
- fixup_t2_movt_hi16, // :upper16:
- fixup_t2_movw_lo16, // :lower16:
+ fixup_t2_movt_hi16, // :upper16:
+ fixup_t2_movw_lo16, // :lower16:
- // fixup_arm_mod_imm - Fixup for mod_imm
+ // Fixup for mod_imm
fixup_arm_mod_imm,
- // fixup_t2_so_imm - Fixup for Thumb2 8-bit rotated operand
+ // Fixup for Thumb2 8-bit rotated operand
fixup_t2_so_imm,
// Marker
@@ -118,6 +109,6 @@ enum Fixups {
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
};
}
-}
+} // namespace llvm
#endif
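Each enumerator above is paired with an MCFixupKindInfo entry returned by the backend's getFixupKindInfo. The entries below are purely illustrative (offsets, widths and flags assumed, not taken from this patch):

    // Assumed-shape sketch of the companion table in the ARM backend.
    static const MCFixupKindInfo Infos[] = {
        // name                     offset bits flags
        {"fixup_arm_ldst_pcrel_12", 0,     32,  MCFixupKindInfo::FKF_IsPCRel},
        {"fixup_arm_thumb_bl",      0,     32,  MCFixupKindInfo::FKF_IsPCRel},
    };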
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 5c3b45ac2328..d18298385adf 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -230,13 +230,25 @@ void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
namespace llvm {
// Prepare value for the target space for it
-void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
+void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
+ const MCValue &Target,
+ uint64_t &Value,
MCContext *Ctx) const {
// The size of the fixup in bits.
uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
unsigned Kind = Fixup.getKind();
+ // Parsed LLVM-generated temporary labels are already
+ // adjusted for instruction size, but normal labels aren't.
+ //
+ // To handle both cases, we simply un-adjust the temporary label
+ // case so it acts like all other labels.
+ if (const MCSymbolRefExpr *A = Target.getSymA()) {
+ if (A->getSymbol().isTemporary())
+ Value += 2;
+ }
+
switch (Kind) {
default:
llvm_unreachable("unhandled fixup");
@@ -333,9 +345,10 @@ MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
MCELFObjectTargetWriter::getOSABI(OSType));
}
-void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel, MCContext &Ctx) const {
+void AVRAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const {
+ adjustFixupValue(Fixup, Target, Value, &Asm.getContext());
if (Value == 0)
return; // Doesn't change encoding.
@@ -349,7 +362,7 @@ void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
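The masking step that comment refers to is unchanged by this patch; for orientation it typically looks like the following sketch (shapes assumed from the surrounding code; AVR is little-endian, so byte i carries bits [8*i, 8*i+8)):

    // Shift the fixup value into position, then OR it into the fragment
    // byte by byte; Data is the MutableArrayRef passed to applyFixup.
    for (unsigned i = 0; i != NumBytes; ++i)
      Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);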
@@ -436,30 +449,16 @@ bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void AVRAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
switch ((unsigned) Fixup.getKind()) {
+ default: return false;
// Fixups which should always be recorded as relocations.
case AVR::fixup_7_pcrel:
case AVR::fixup_13_pcrel:
case AVR::fixup_call:
- IsResolved = false;
- break;
- default:
- // Parsed LLVM-generated temporary labels are already
- // adjusted for instruction size, but normal labels aren't.
- //
- // To handle both cases, we simply un-adjust the temporary label
- // case so it acts like all other labels.
- if (Target.getSymA()->getSymbol().isTemporary())
- Value += 2;
-
- adjustFixupValue(Fixup, Value, &Asm.getContext());
- break;
+ return true;
}
}
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index f2be2494684a..4a75e3b0d22d 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -35,13 +35,14 @@ public:
AVRAsmBackend(Triple::OSType OSType)
: MCAsmBackend(), OSType(OSType) {}
- void adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) const;
+ void adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
+ uint64_t &Value, MCContext *Ctx = nullptr) const;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -63,10 +64,8 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
private:
Triple::OSType OSType;
diff --git a/lib/Target/BPF/BPFISelDAGToDAG.cpp b/lib/Target/BPF/BPFISelDAGToDAG.cpp
index c6ddd6bdad5e..f48429ee57b0 100644
--- a/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -16,6 +16,7 @@
#include "BPFRegisterInfo.h"
#include "BPFSubtarget.h"
#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -57,6 +58,11 @@ private:
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+ // Node preprocessing cases
+ void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I);
+ void PreprocessCopyToReg(SDNode *Node);
+ void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I);
+
// Find constants from a constant structure
typedef std::vector<unsigned char> val_vec_type;
bool fillGenericConstant(const DataLayout &DL, const Constant *CV,
@@ -69,9 +75,12 @@ private:
val_vec_type &Vals, int Offset);
bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
uint64_t Size, unsigned char *ByteSeq);
+ bool checkLoadDef(unsigned DefReg, unsigned match_load_op);
// Mapping from ConstantStruct global value to corresponding byte-list values
std::map<const void *, val_vec_type> cs_vals_;
+ // Mapping from vreg to load memory opcode
+ std::map<unsigned, unsigned> load_to_vreg_;
};
} // namespace
@@ -203,89 +212,110 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
SelectCode(Node);
}
+void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
+ SelectionDAG::allnodes_iterator I) {
+ union {
+ uint8_t c[8];
+ uint16_t s;
+ uint32_t i;
+ uint64_t d;
+ } new_val; // holds the constant value that replaces the load.
+ bool to_replace = false;
+ SDLoc DL(Node);
+ const LoadSDNode *LD = cast<LoadSDNode>(Node);
+ uint64_t size = LD->getMemOperand()->getSize();
+
+ if (!size || size > 8 || (size & (size - 1)))
+ return;
+
+ SDNode *LDAddrNode = LD->getOperand(1).getNode();
+ // Match LDAddr against either global_addr or (global_addr + offset)
+ unsigned opcode = LDAddrNode->getOpcode();
+ if (opcode == ISD::ADD) {
+ SDValue OP1 = LDAddrNode->getOperand(0);
+ SDValue OP2 = LDAddrNode->getOperand(1);
+
+ // We want to find the pattern global_addr + offset
+ SDNode *OP1N = OP1.getNode();
+ if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0)
+ return;
+
+ DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+
+ const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
+ const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode());
+ if (GADN && CDN)
+ to_replace =
+ getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
+ } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
+ LDAddrNode->getNumOperands() > 0) {
+ DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+
+ SDValue OP1 = LDAddrNode->getOperand(0);
+ if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(OP1.getNode()))
+ to_replace = getConstantFieldValue(GADN, 0, size, new_val.c);
+ }
+
+ if (!to_replace)
+ return;
+
+ // replacing the old with a new value
+ uint64_t val;
+ if (size == 1)
+ val = new_val.c[0];
+ else if (size == 2)
+ val = new_val.s;
+ else if (size == 4)
+ val = new_val.i;
+ else {
+ val = new_val.d;
+ }
+
+ DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val
+ << '\n');
+ SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
+
+ // After replacement, the current node is dead; step the iterator
+ // back one node so it stays valid across the use replacement.
+ I--;
+ SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)};
+ SDValue To[] = {NVal, NVal};
+ CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
+ I++;
+ // It is safe to delete the node now.
+ CurDAG->DeleteNode(Node);
+}
+
void BPFDAGToDAGISel::PreprocessISelDAG() {
- // Iterate through all nodes, only interested in loads from ConstantStruct
- // ConstantArray should have converted by IR->DAG processing
+ // Iterate through all nodes, interested in the following cases:
+ //
+ // . loads from ConstantStruct or ConstantArray of constructs
+ // which can be turned into a constant itself; this lets us
+ // avoid reading from the read-only section at runtime.
+ //
+ // . register truncation, often the result of 8/16/32bit->64bit or
+ // 8/16bit->32bit conversion. If the register value is loaded with
+ // the masked byte width, the AND operation can be removed since
+ // BPF LOAD instructions already zero-extend.
+ //
+ // The latter also fixes a correctness issue.
+ // In BPF socket-related programs, e.g., __sk_buff->{data, data_end}
+ // are 32-bit registers, but the kernel verifier will later rewrite
+ // them with 64-bit values. Therefore, truncating the value after the
+ // load would result in incorrect code.
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end();
I != E;) {
SDNode *Node = &*I++;
unsigned Opcode = Node->getOpcode();
- if (Opcode != ISD::LOAD)
- continue;
-
- union {
- uint8_t c[8];
- uint16_t s;
- uint32_t i;
- uint64_t d;
- } new_val; // hold up the constant values replacing loads.
- bool to_replace = false;
- SDLoc DL(Node);
- const LoadSDNode *LD = cast<LoadSDNode>(Node);
- uint64_t size = LD->getMemOperand()->getSize();
- if (!size || size > 8 || (size & (size - 1)))
- continue;
-
- SDNode *LDAddrNode = LD->getOperand(1).getNode();
- // Match LDAddr against either global_addr or (global_addr + offset)
- unsigned opcode = LDAddrNode->getOpcode();
- if (opcode == ISD::ADD) {
- SDValue OP1 = LDAddrNode->getOperand(0);
- SDValue OP2 = LDAddrNode->getOperand(1);
-
- // We want to find the pattern global_addr + offset
- SDNode *OP1N = OP1.getNode();
- if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END ||
- OP1N->getNumOperands() == 0)
- continue;
-
- DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
-
- const GlobalAddressSDNode *GADN =
- dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
- const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode());
- if (GADN && CDN)
- to_replace =
- getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
- } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
- LDAddrNode->getNumOperands() > 0) {
- DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
-
- SDValue OP1 = LDAddrNode->getOperand(0);
- if (const GlobalAddressSDNode *GADN =
- dyn_cast<GlobalAddressSDNode>(OP1.getNode()))
- to_replace = getConstantFieldValue(GADN, 0, size, new_val.c);
- }
-
- if (!to_replace)
- continue;
-
- // replacing the old with a new value
- uint64_t val;
- if (size == 1)
- val = new_val.c[0];
- else if (size == 2)
- val = new_val.s;
- else if (size == 4)
- val = new_val.i;
- else {
- val = new_val.d;
- }
-
- DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
- << val << '\n');
- SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
-
- // After replacement, the current node is dead, we need to
- // go backward one step to make iterator still work
- I--;
- SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)};
- SDValue To[] = {NVal, NVal};
- CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
- I++;
- // It is safe to delete node now
- CurDAG->DeleteNode(Node);
+ if (Opcode == ISD::LOAD)
+ PreprocessLoad(Node, I);
+ else if (Opcode == ISD::CopyToReg)
+ PreprocessCopyToReg(Node);
+ else if (Opcode == ISD::AND)
+ PreprocessTrunc(Node, I);
}
}
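The ISD::AND case feeds PreprocessTrunc further down. The key observation is that each byte-width mask maps one-to-one onto a load opcode whose implicit zero-extension subsumes the AND; a standalone sketch of that mapping (mirroring the switch in PreprocessTrunc):

    // Returns the BPF load opcode whose zero-extension subsumes Mask,
    // or 0 when the mask is not a byte-width mask and the AND must stay.
    static unsigned loadOpForMask(uint64_t Mask) {
      switch (Mask) {
      case 0xFF:       return BPF::LDB; // 1-byte load, zero-extended
      case 0xFFFF:     return BPF::LDH; // 2-byte load, zero-extended
      case 0xFFFFFFFF: return BPF::LDW; // 4-byte load, zero-extended
      default:         return 0;
      }
    }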
@@ -415,6 +445,134 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
return true;
}
+void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
+ const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1));
+ if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
+ return;
+
+ const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2));
+ if (!LD)
+ return;
+
+ // Assign a load value to a virtual register and record its load width.
+ unsigned mem_load_op = 0;
+ switch (LD->getMemOperand()->getSize()) {
+ default:
+ return;
+ case 4:
+ mem_load_op = BPF::LDW;
+ break;
+ case 2:
+ mem_load_op = BPF::LDH;
+ break;
+ case 1:
+ mem_load_op = BPF::LDB;
+ break;
+ }
+
+ DEBUG(dbgs() << "Find Load Value to VReg "
+ << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n');
+ load_to_vreg_[RegN->getReg()] = mem_load_op;
+}
+
+void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
+ SelectionDAG::allnodes_iterator I) {
+ ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!MaskN)
+ return;
+
+ unsigned match_load_op = 0;
+ switch (MaskN->getZExtValue()) {
+ default:
+ return;
+ case 0xFFFFFFFF:
+ match_load_op = BPF::LDW;
+ break;
+ case 0xFFFF:
+ match_load_op = BPF::LDH;
+ break;
+ case 0xFF:
+ match_load_op = BPF::LDB;
+ break;
+ }
+
+ // The Reg operand should be a virtual register, which is defined
+ // outside the current basic block. The DAG combiner has already done a
+ // good job of removing truncations within a single basic block.
+ SDValue BaseV = Node->getOperand(0);
+ if (BaseV.getOpcode() != ISD::CopyFromReg)
+ return;
+
+ const RegisterSDNode *RegN =
+ dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
+ if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
+ return;
+ unsigned AndOpReg = RegN->getReg();
+ DEBUG(dbgs() << "Examine %vreg" << TargetRegisterInfo::virtReg2Index(AndOpReg)
+ << '\n');
+
+ // Examine the PHI insns in the MachineBasicBlock to find out the
+ // definitions of this virtual register. At this stage (DAG2DAG
+ // transformation), only PHI machine insns are available in the machine basic
+ // block.
+ MachineBasicBlock *MBB = FuncInfo->MBB;
+ MachineInstr *MII = nullptr;
+ for (auto &MI : *MBB) {
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+ const MachineOperand &MOP = MI.getOperand(i);
+ if (!MOP.isReg() || !MOP.isDef())
+ continue;
+ unsigned Reg = MOP.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) {
+ MII = &MI;
+ break;
+ }
+ }
+ }
+
+ if (MII == nullptr) {
+ // No phi definition in this block.
+ if (!checkLoadDef(AndOpReg, match_load_op))
+ return;
+ } else {
+ // The PHI node looks like:
+ // %vreg2<def> = PHI %vreg0, <BB#1>, %vreg1, <BB#3>
+ // Trace each incoming definition, e.g., (%vreg0, BB#1) and (%vreg1, BB#3)
+ // The AND operation can be removed if both %vreg0 in BB#1 and %vreg1 in
+ // BB#3 are defined with a load matching the MaskN.
+ DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
+ unsigned PrevReg = -1;
+ for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
+ const MachineOperand &MOP = MII->getOperand(i);
+ if (MOP.isReg()) {
+ if (MOP.isDef())
+ continue;
+ PrevReg = MOP.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(PrevReg))
+ return;
+ if (!checkLoadDef(PrevReg, match_load_op))
+ return;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
+ dbgs() << '\n');
+
+ I--;
+ CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
+ I++;
+ CurDAG->DeleteNode(Node);
+}
+
+bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) {
+ auto it = load_to_vreg_.find(DefReg);
+ if (it == load_to_vreg_.end())
+ return false; // The definition of register is not exported yet.
+
+ return it->second == match_load_op;
+}
+
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
return new BPFDAGToDAGISel(TM);
}
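A note on the union in PreprocessLoad: getConstantFieldValue fills new_val.c byte by byte and the union reinterprets those bytes at the load's width. The same widening can be written without the union; a minimal sketch (little-endian host assumed, matching BPF byte order):

    #include <cstdint>
    #include <cstring>

    // size was validated earlier: a power of two, no larger than 8.
    static uint64_t widenBytes(const uint8_t *c, uint64_t size) {
      uint64_t v = 0;
      std::memcpy(&v, c, size); // low-order bytes first
      return v;
    }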
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 2b0ceaa66258..97a53dcbaed7 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -178,8 +178,8 @@ static cl::opt<bool> EnableSaveRestoreLong("enable-save-restore-long",
cl::Hidden, cl::desc("Enable long calls for save-restore stubs."),
cl::init(false), cl::ZeroOrMore);
-static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true),
- cl::Hidden, cl::desc("Use allocframe more conservatively"));
+static cl::opt<bool> EliminateFramePointer("hexagon-fp-elim", cl::init(true),
+ cl::Hidden, cl::desc("Refrain from using FP whenever possible"));
static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden,
cl::init(true), cl::desc("Optimize spill slots"));
@@ -550,7 +550,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
auto &HRI = *HST.getRegisterInfo();
- DebugLoc dl;
unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment());
@@ -584,77 +583,56 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
MI->eraseFromParent();
}
- if (!hasFP(MF))
- return;
-
- // Check for overflow.
- // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
- const unsigned int ALLOCFRAME_MAX = 16384;
+ DebugLoc dl = MBB.findDebugLoc(InsertPt);
- // Create a dummy memory operand to avoid allocframe from being treated as
- // a volatile memory reference.
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
- 4, 4);
-
- if (NumBytes >= ALLOCFRAME_MAX) {
- // Emit allocframe(#0).
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
- .addImm(0)
- .addMemOperand(MMO);
-
- // Subtract offset from frame pointer.
- // We use a caller-saved non-parameter register for that.
- unsigned CallerSavedReg = HRI.getFirstCallerSavedNonParamReg();
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CONST32),
- CallerSavedReg).addImm(NumBytes);
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_sub), SP)
+ if (hasFP(MF)) {
+ insertAllocframe(MBB, InsertPt, NumBytes);
+ if (AlignStack) {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
+ .addReg(SP)
+ .addImm(-int64_t(MaxAlign));
+ }
+ // If the stack-checking is enabled, and we spilled the callee-saved
+ // registers inline (i.e. did not use a spill function), then call
+ // the stack checker directly.
+ if (EnableStackOVFSanitizer && !PrologueStubs)
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk))
+ .addExternalSymbol("__runtime_stack_check");
+ } else if (NumBytes > 0) {
+ assert(alignTo(NumBytes, 8) == NumBytes);
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
.addReg(SP)
- .addReg(CallerSavedReg);
- } else {
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
- .addImm(NumBytes)
- .addMemOperand(MMO);
+ .addImm(-int(NumBytes));
}
-
- if (AlignStack) {
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
- .addReg(SP)
- .addImm(-int64_t(MaxAlign));
- }
-
- // If the stack-checking is enabled, and we spilled the callee-saved
- // registers inline (i.e. did not use a spill function), then call
- // the stack checker directly.
- if (EnableStackOVFSanitizer && !PrologueStubs)
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk))
- .addExternalSymbol("__runtime_stack_check");
}
void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
MachineFunction &MF = *MBB.getParent();
- if (!hasFP(MF))
- return;
-
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
auto &HRI = *HST.getRegisterInfo();
unsigned SP = HRI.getStackRegister();
+ MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
+ DebugLoc dl = MBB.findDebugLoc(InsertPt);
+
+ if (!hasFP(MF)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (unsigned NumBytes = MFI.getStackSize()) {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+ .addReg(SP)
+ .addImm(NumBytes);
+ }
+ return;
+ }
+
MachineInstr *RetI = getReturn(MBB);
unsigned RetOpc = RetI ? RetI->getOpcode() : 0;
- MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
- DebugLoc DL;
- if (InsertPt != MBB.end())
- DL = InsertPt->getDebugLoc();
- else if (!MBB.empty())
- DL = std::prev(MBB.end())->getDebugLoc();
-
// Handle EH_RETURN.
if (RetOpc == Hexagon::EH_RETURN_JMPR) {
- BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
- BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::A2_add), SP)
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe));
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_add), SP)
.addReg(SP)
.addReg(Hexagon::R28);
return;
@@ -699,16 +677,52 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
// otherwise just add deallocframe. The function could be returning via a
// tail call.
if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) {
- BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe));
return;
}
unsigned NewOpc = Hexagon::L4_return;
- MachineInstr *NewI = BuildMI(MBB, RetI, DL, HII.get(NewOpc));
+ MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc));
// Transfer the function live-out registers.
NewI->copyImplicitOps(MF, *RetI);
MBB.erase(RetI);
}
+void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const {
+ MachineFunction &MF = *MBB.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HII = *HST.getInstrInfo();
+ auto &HRI = *HST.getRegisterInfo();
+
+ // Check for overflow.
+ // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
+ const unsigned int ALLOCFRAME_MAX = 16384;
+
+ // Create a dummy memory operand to prevent allocframe from being treated as
+ // a volatile memory reference.
+ auto *MMO = MF.getMachineMemOperand(MachinePointerInfo::getStack(MF, 0),
+ MachineMemOperand::MOStore, 4, 4);
+
+ DebugLoc dl = MBB.findDebugLoc(InsertPt);
+
+ if (NumBytes >= ALLOCFRAME_MAX) {
+ // Emit allocframe(#0).
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+ .addImm(0)
+ .addMemOperand(MMO);
+
+ // Subtract the size from the stack pointer.
+ unsigned SP = HRI.getStackRegister();
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+ .addReg(SP)
+ .addImm(-int(NumBytes));
+ } else {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+ .addImm(NumBytes)
+ .addMemOperand(MMO);
+ }
+}
+
void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF,
MachineBasicBlock &SaveB) const {
SetVector<unsigned> Worklist;
@@ -928,12 +942,11 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
}
bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
+ if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ return false;
+
auto &MFI = MF.getFrameInfo();
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
-
- bool HasFixed = MFI.getNumFixedObjects();
- bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
- .getLocalFrameObjectCount();
bool HasExtraAlign = HRI.needsStackRealignment(MF);
bool HasAlloca = MFI.hasVarSizedObjects();
@@ -947,18 +960,35 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
// By default we want to use SP (since it's always there). FP requires
// some setup (i.e. ALLOCFRAME).
- // Fixed and preallocated objects need FP if the distance from them to
- // the SP is unknown (as is with alloca or aligna).
- if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign))
+ // Both alloca and stack alignment modify the stack pointer by an
+ // undetermined value, so we need to save it at the entry to the function
+ // (i.e. use allocframe).
+ if (HasAlloca || HasExtraAlign)
return true;
if (MFI.getStackSize() > 0) {
- if (EnableStackOVFSanitizer || UseAllocframe)
+ // If FP-elimination is disabled, we have to use FP at this point.
+ const TargetMachine &TM = MF.getTarget();
+ if (TM.Options.DisableFramePointerElim(MF) || !EliminateFramePointer)
+ return true;
+ if (EnableStackOVFSanitizer)
return true;
}
- if (MFI.hasCalls() ||
- MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR())
+ const auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ if (MFI.hasCalls() || HMFI.hasClobberLR())
+ return true;
+
+ // Frame pointer elimination is a possibility at this point, but
+ // to know if FP is necessary we need to know if spill/restore
+ // functions will be used (they require FP to be valid).
+ // This means that hasFP shouldn't really be called before CSI is
+ // calculated, and some measures are taken to make sure of that
+ // (e.g. default implementations of virtual functions that call it
+ // are overridden appropriately).
+ assert(MFI.isCalleeSavedInfoValid() && "Need to know CSI");
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (useSpillFunction(MF, CSI) || useRestoreFunction(MF, CSI))
return true;
return false;
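Restating the rewritten hasFP as a checklist (a summary of the logic above, not new behavior):

    // 1. Naked functions never get a frame pointer.
    // 2. alloca or stack realignment forces FP: SP moves by an amount
    //    unknown at compile time, so the entry SP must be saved.
    // 3. A non-empty frame forces FP when FP elimination is disabled
    //    or the stack-overflow sanitizer is on.
    // 4. Calls or a clobbered LR force FP.
    // 5. Spill/restore library calls force FP (they address the save
    //    area through it), which is why CSI must be valid here.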
@@ -1051,9 +1081,10 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
bool HasExtraAlign = HRI.needsStackRealignment(MF);
bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
- unsigned FrameSize = MFI.getStackSize();
- unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister();
auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ unsigned FrameSize = MFI.getStackSize();
+ unsigned SP = HRI.getStackRegister();
+ unsigned FP = HRI.getFrameRegister();
unsigned AP = HMFI.getStackAlignBasePhysReg();
// It may happen that AP will be absent even when HasAlloca && HasExtraAlign
// is true. HasExtraAlign may be set because of vector spills, without
@@ -1135,7 +1166,7 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
// there will be no SP -= FrameSize), so the frame size should not be
// added to the calculated offset.
int RealOffset = Offset;
- if (!UseFP && !UseAP && HasFP)
+ if (!UseFP && !UseAP)
RealOffset = FrameSize+Offset;
return RealOffset;
}
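A worked instance of the RealOffset rule above, with assumed numbers:

    // Suppose FrameSize = 32 and an object sits at Offset = -8 relative
    // to the incoming SP. After the prologue performs SP -= 32, the same
    // object is at SP + 24, i.e. FrameSize + Offset, which is exactly
    // what the !UseFP && !UseAP path computes.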
@@ -2402,7 +2433,7 @@ void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI,
/// be generated via inline code. If this function returns "true", inline
/// code will be generated. If this function returns "false", additional
/// checks are performed, which may still lead to the inline code.
-bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF,
+bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF,
const CSIVect &CSI) const {
if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
return true;
@@ -2432,7 +2463,7 @@ bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF,
return false;
}
-bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF,
+bool HexagonFrameLowering::useSpillFunction(const MachineFunction &MF,
const CSIVect &CSI) const {
if (shouldInlineCSR(MF, CSI))
return false;
@@ -2445,7 +2476,7 @@ bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF,
return Threshold < NumCSI;
}
-bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF,
+bool HexagonFrameLowering::useRestoreFunction(const MachineFunction &MF,
const CSIVect &CSI) const {
if (shouldInlineCSR(MF, CSI))
return false;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 529a61d4a5b5..f4d4e1b61a26 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -48,6 +48,15 @@ public:
return true;
}
+ bool hasReservedCallFrame(const MachineFunction &MF) const override {
+ // We always reserve call frame as a part of the initial stack allocation.
+ return true;
+ }
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override {
+ // Override this function to avoid calling hasFP before CSI is set
+ // (the default implementation calls hasFP).
+ return true;
+ }
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
@@ -94,6 +103,8 @@ private:
unsigned SP, unsigned CF) const;
void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const;
void insertEpilogueInBlock(MachineBasicBlock &MBB) const;
+ void insertAllocframe(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const;
bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
const HexagonRegisterInfo &HRI, bool &PrologueStubs) const;
bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
@@ -148,9 +159,9 @@ private:
void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI,
bool IsDef, bool IsKill) const;
- bool shouldInlineCSR(MachineFunction &MF, const CSIVect &CSI) const;
- bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const;
- bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const;
+ bool shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const;
+ bool useSpillFunction(const MachineFunction &MF, const CSIVect &CSI) const;
+ bool useRestoreFunction(const MachineFunction &MF, const CSIVect &CSI) const;
bool mayOverflowFrameOffset(MachineFunction &MF) const;
};
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index afed894cfb9a..2daacf795555 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1002,51 +1002,46 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue
HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
MachineFunction &MF = DAG.getMachineFunction();
- auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
- switch (Node->getOpcode()) {
- case ISD::INLINEASM: {
- unsigned NumOps = Node->getNumOperands();
- if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
- --NumOps; // Ignore the flag operand.
-
- for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- if (FuncInfo.hasClobberLR())
- break;
- unsigned Flags =
- cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
- unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
- ++i; // Skip the ID value.
-
- switch (InlineAsm::getKind(Flags)) {
- default: llvm_unreachable("Bad flags!");
- case InlineAsm::Kind_RegDef:
- case InlineAsm::Kind_RegUse:
- case InlineAsm::Kind_Imm:
- case InlineAsm::Kind_Clobber:
- case InlineAsm::Kind_Mem: {
- for (; NumVals; --NumVals, ++i) {}
- break;
- }
- case InlineAsm::Kind_RegDefEarlyClobber: {
- for (; NumVals; --NumVals, ++i) {
- unsigned Reg =
- cast<RegisterSDNode>(Node->getOperand(i))->getReg();
-
- // Check it to be lr
- const HexagonRegisterInfo *QRI = Subtarget.getRegisterInfo();
- if (Reg == QRI->getRARegister()) {
- FuncInfo.setHasClobberLR(true);
- break;
- }
- }
- break;
- }
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+ unsigned LR = HRI.getRARegister();
+
+ if (Op.getOpcode() != ISD::INLINEASM || HMFI.hasClobberLR())
+ return Op;
+
+ unsigned NumOps = Op.getNumOperands();
+ if (Op.getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the flag operand.
+
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ ++i; // Skip the ID value.
+
+ switch (InlineAsm::getKind(Flags)) {
+ default:
+ llvm_unreachable("Bad flags!");
+ case InlineAsm::Kind_RegUse:
+ case InlineAsm::Kind_Imm:
+ case InlineAsm::Kind_Mem:
+ i += NumVals;
+ break;
+ case InlineAsm::Kind_Clobber:
+ case InlineAsm::Kind_RegDef:
+ case InlineAsm::Kind_RegDefEarlyClobber: {
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
+ if (Reg != LR)
+ continue;
+ HMFI.setHasClobberLR(true);
+ return Op;
}
+ break;
}
}
- } // Node->getOpcode
+ }
+
return Op;
}
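The operand walk above relies on the standard INLINEASM node layout: each operand group begins with a flags word encoding the group's kind and operand count. A reduced sketch of the stepping logic:

    // Each group is [flags][val0]...[valN-1]; the flags word gives N.
    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
    unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
    i += 1 + NumVals; // skip the flags word plus its operands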
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index fec2dc5ce306..1eac2d3dd8e2 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -1253,10 +1253,16 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+ unsigned PReg = Op1.getReg();
+ assert(Op1.getSubReg() == 0);
+ unsigned PState = getRegState(Op1);
+
if (Op0.getReg() != Op2.getReg()) {
+ unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill
+ : PState;
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov))
.add(Op0)
- .add(Op1)
+ .addReg(PReg, S)
.add(Op2);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
@@ -1265,7 +1271,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (Op0.getReg() != Op3.getReg()) {
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov))
.add(Op0)
- .add(Op1)
+ .addReg(PReg, PState)
.add(Op3);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
@@ -1282,12 +1288,18 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+ unsigned PReg = Op1.getReg();
+ assert(Op1.getSubReg() == 0);
+ unsigned PState = getRegState(Op1);
if (Op0.getReg() != Op2.getReg()) {
+ unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill
+ : PState;
unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
.add(Op0)
+ .addReg(PReg, S)
.add(Op1)
.addReg(SrcHi)
.addReg(SrcLo);
@@ -1300,7 +1312,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
.add(Op0)
- .add(Op1)
+ .addReg(PReg, PState)
.addReg(SrcHi)
.addReg(SrcLo);
if (IsDestLive)
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index de6b203015d8..e93f075f4ccd 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -69,9 +69,7 @@ namespace {
public:
static char ID;
- HexagonNewValueJump() : MachineFunctionPass(ID) {
- initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
- }
+ HexagonNewValueJump() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
@@ -445,8 +443,6 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
unsigned predReg = 0; // predicate reg of the jump.
unsigned cmpReg1 = 0;
int cmpOp2 = 0;
- bool MO1IsKill = false;
- bool MO2IsKill = false;
MachineBasicBlock::iterator jmpPos;
MachineBasicBlock::iterator cmpPos;
MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
@@ -548,14 +544,10 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// We need cmpReg1 and cmpOp2(imm or reg) while building
// new value jump instruction.
cmpReg1 = MI.getOperand(1).getReg();
- if (MI.getOperand(1).isKill())
- MO1IsKill = true;
- if (isSecondOpReg) {
+ if (isSecondOpReg)
cmpOp2 = MI.getOperand(2).getReg();
- if (MI.getOperand(2).isKill())
- MO2IsKill = true;
- } else
+ else
cmpOp2 = MI.getOperand(2).getImm();
continue;
}
@@ -605,11 +597,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
if ((COp == Hexagon::C2_cmpeq || COp == Hexagon::C4_cmpneq) &&
(feederReg == (unsigned) cmpOp2)) {
unsigned tmp = cmpReg1;
- bool tmpIsKill = MO1IsKill;
cmpReg1 = cmpOp2;
- MO1IsKill = MO2IsKill;
cmpOp2 = tmp;
- MO2IsKill = tmpIsKill;
}
// Now we have swapped the operands, all we need to check is,
@@ -623,31 +612,33 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// make sure we are respecting the kill values of
// the operands of the feeder.
- bool updatedIsKill = false;
- for (unsigned i = 0; i < MI.getNumOperands(); i++) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() && MO.isUse()) {
- unsigned feederReg = MO.getReg();
- for (MachineBasicBlock::iterator localII = feederPos,
- end = cmpInstr->getIterator(); localII != end; localII++) {
- MachineInstr &localMI = *localII;
- for (unsigned j = 0; j < localMI.getNumOperands(); j++) {
- MachineOperand &localMO = localMI.getOperand(j);
- if (localMO.isReg() && localMO.isUse() &&
- localMO.isKill() && feederReg == localMO.getReg()) {
- // We found that there is kill of a use register
- // Set up a kill flag on the register
- localMO.setIsKill(false);
- MO.setIsKill();
- updatedIsKill = true;
- break;
- }
+ auto TransferKills = [jmpPos,cmpPos] (MachineInstr &MI) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned UseR = MO.getReg();
+ for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) {
+ if (I == cmpPos)
+ continue;
+ for (MachineOperand &Op : I->operands()) {
+ if (!Op.isReg() || !Op.isUse() || !Op.isKill())
+ continue;
+ if (Op.getReg() != UseR)
+ continue;
+ // An intervening instruction kills this use register; since
+ // MI is spliced past it, move the kill flag onto MI's operand.
+ Op.setIsKill(false);
+ MO.setIsKill(true);
+ return;
}
- if (updatedIsKill) break;
}
}
- if (updatedIsKill) break;
- }
+ };
+
+ TransferKills(*feederPos);
+ TransferKills(*cmpPos);
+ bool MO1IsKill = cmpPos->killsRegister(cmpReg1, QRI);
+ bool MO2IsKill = isSecondOpReg && cmpPos->killsRegister(cmpOp2, QRI);
MBB->splice(jmpPos, MI.getParent(), MI);
MBB->splice(jmpPos, MI.getParent(), cmpInstr);
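The TransferKills lambda above preserves kill flags when the feeder and compare are spliced down to the jump. The invariant, restated as a sketch:

    // If an instruction between MI's old position and jmpPos killed a
    // register MI uses, MI becomes the last user after the splice, so
    // the kill flag must migrate:
    Op.setIsKill(false); // the old kill now precedes MI's use
    MO.setIsKill(true);  // MI's operand is the final use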
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 27b40f134b1f..a331c978f59d 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -535,9 +535,9 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
!MI->getOperand(1).isGlobal())
continue;
- DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n");
- DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG)
- << "\n");
+ DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode()) << "]: "
+ << *MI << "\n\t[InstrNode]: "
+ << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
@@ -605,7 +605,9 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
const TargetOperandInfo TOI(*HII);
DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI);
- G.build();
+ // Need to keep dead phis because we can propagate uses of registers into
+ // nodes dominated by those would-be phis.
+ G.build(BuildOptions::KeepDeadPhis);
DFG = &G;
Liveness L(MRI, *DFG);
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 031a1bdefafb..76d9b31b005f 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -113,6 +113,7 @@ namespace llvm {
void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
+ void initializeHexagonNewValueJumpPass(PassRegistry&);
Pass *createHexagonLoopIdiomPass();
FunctionPass *createHexagonBitSimplify();
@@ -158,6 +159,7 @@ extern "C" void LLVMInitializeHexagonTarget() {
initializeHexagonLoopIdiomRecognizePass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonOptAddrModePass(PR);
+ initializeHexagonNewValueJumpPass(PR);
}
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 4dacb1501392..34df2ebcc520 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -49,6 +49,10 @@ static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
cl::Hidden, cl::init(false),
cl::desc("Trace global value placement"));
+static cl::opt<bool>
+ EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false),
+ cl::desc("Emit hexagon jump tables in function section"));
+
// TraceGVPlacement controls messages for all builds. For builds with assertions
// (debug or release), messages are also controlled by the usual debug flags
// (e.g. -debug and -debug-only=globallayout)
@@ -256,6 +260,11 @@ unsigned HexagonTargetObjectFile::getSmallDataSize() const {
return SmallDataThreshold;
}
+bool HexagonTargetObjectFile::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ return EmitJtInText;
+}
+
/// Descends any type down to "elementary" components,
/// discovering the smallest addressable one.
/// If zero is returned, declaration will not be modified.
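Operationally the new option is toggled from the llc command line; an assumed invocation (flag name taken from the cl::opt definition above):

    llc -march=hexagon -hexagon-emit-jt-text foo.ll

routes jump tables into the enclosing function's section instead of read-only data, keeping .text self-contained.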
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 58dff2b95e19..373d850b53be 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -33,6 +33,9 @@ namespace llvm {
unsigned getSmallDataSize() const;
+ bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference,
+ const Function &F) const override;
+
private:
MCSectionELF *SmallDataSection;
MCSectionELF *SmallBSSSection;
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index d578bfab3658..aac810e29fe9 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -21,6 +21,10 @@ using namespace llvm;
#define DEBUG_TYPE "hexagontti"
+static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
+ cl::init(true), cl::Hidden,
+ cl::desc("Control lookup table emission on Hexagon target"));
+
TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
// Return Fast Hardware support as every input < 64 bits will be promoted
@@ -29,7 +33,7 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
}
// The Hexagon target can unroll loops with run-time trip counts.
-void HexagonTTIImpl::getUnrollingPreferences(Loop *L,
+void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Runtime = UP.Partial = true;
}
@@ -46,8 +50,9 @@ unsigned HexagonTTIImpl::getCacheLineSize() const {
return getST()->getL1CacheLineSize();
}
-int HexagonTTIImpl::getUserCost(const User *U) {
- auto isCastFoldedIntoLoad = [] (const CastInst *CI) -> bool {
+int HexagonTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) {
+ auto isCastFoldedIntoLoad = [](const CastInst *CI) -> bool {
if (!CI->isIntegerCast())
return false;
const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
@@ -67,5 +72,9 @@ int HexagonTTIImpl::getUserCost(const User *U) {
if (const CastInst *CI = dyn_cast<const CastInst>(U))
if (isCastFoldedIntoLoad(CI))
return TargetTransformInfo::TCC_Free;
- return BaseT::getUserCost(U);
+ return BaseT::getUserCost(U, Operands);
+}
+
+bool HexagonTTIImpl::shouldBuildLookupTables() const {
+ return EmitLookupTables;
}
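shouldBuildLookupTables is the TTI hook that SimplifyCFG consults before converting a dense switch into a table load, so the flag acts as a whole-target off switch. A sketch of the consuming side (shape assumed; the real check lives in generic SimplifyCFG code):

    if (!TTI.shouldBuildLookupTables())
      return false; // keep the switch as a compare-and-branch tree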
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 8414bfc4e197..ab5a6e07d873 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -46,7 +46,8 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
// The Hexagon target can unroll loops with run-time trip counts.
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
// L1 cache prefetch.
unsigned getPrefetchDistance() const;
@@ -61,7 +62,10 @@ public:
/// @}
- int getUserCost(const User *U);
+ int getUserCost(const User *U, ArrayRef<const Value *> Operands);
+
+ // Hexagon specific decision to generate a lookup table.
+ bool shouldBuildLookupTables() const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 093ce80bc2e3..34d0b55aa22a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -199,11 +199,8 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- /// processFixupValue - Target hook to adjust the literal value of a fixup
- /// if necessary. IsResolved signals whether the caller believes a relocation
- /// is needed; the target can modify the value. The default does nothing.
- void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, bool &IsResolved) override {
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
MCFixupKind Kind = Fixup.getKind();
switch((unsigned)Kind) {
@@ -299,8 +296,7 @@ public:
case fixup_Hexagon_LD_PLT_B22_PCREL_X:
case fixup_Hexagon_LD_PLT_B32_PCREL_X:
// These relocations should always have a relocation recorded
- IsResolved = false;
- return;
+ return true;
case fixup_Hexagon_B22_PCREL:
//IsResolved = false;
@@ -317,7 +313,7 @@ public:
case fixup_Hexagon_B7_PCREL:
case fixup_Hexagon_B7_PCREL_X:
if (DisableFixup)
- IsResolved = false;
+ return true;
break;
case FK_Data_1:
@@ -326,8 +322,9 @@ public:
case FK_PCRel_4:
case fixup_Hexagon_32:
// Leave these relocations alone as they are used for EH.
- return;
+ return false;
}
+ return false;
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 9d5c179a0fd9..69b1ba1528d0 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -2789,6 +2789,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
bool Is32BitSym, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
+ // FIXME: These expansions do not respect -mxgot.
MipsTargetStreamer &TOut = getTargetStreamer();
bool UseSrcReg = SrcReg != Mips::NoRegister;
warnIfNoMacro(IDLoc);
@@ -2808,8 +2809,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
// symbol in the final relocation is external and not modified with a
// constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16.
if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
- Res.getConstant() == 0 && !Res.getSymA()->getSymbol().isInSection() &&
- !Res.getSymA()->getSymbol().isTemporary()) {
+ Res.getConstant() == 0 &&
+ !(Res.getSymA()->getSymbol().isInSection() ||
+ Res.getSymA()->getSymbol().isTemporary() ||
+ (Res.getSymA()->getSymbol().isELF() &&
+ cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
+ ELF::STB_LOCAL))) {
const MCExpr *CallExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(),
@@ -2865,6 +2870,85 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
return false;
}
+ if (inPicMode() && ABI.ArePtrs64bit()) {
+ MCValue Res;
+ if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ Error(IDLoc, "expected relocatable expression");
+ return true;
+ }
+ if (Res.getSymB() != nullptr) {
+ Error(IDLoc, "expected relocatable expression with only one symbol");
+ return true;
+ }
+
+ // The case where the result register is $25 is somewhat special. If the
+ // symbol in the final relocation is external and not modified with a
+ // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP.
+ if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
+ Res.getConstant() == 0 &&
+ !(Res.getSymA()->getSymbol().isInSection() ||
+ Res.getSymA()->getSymbol().isTemporary() ||
+ (Res.getSymA()->getSymbol().isELF() &&
+ cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
+ ELF::STB_LOCAL))) {
+ const MCExpr *CallExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+ TOut.emitRRX(Mips::LD, DstReg, ABI.GetGlobalPtr(),
+ MCOperand::createExpr(CallExpr), IDLoc, STI);
+ return false;
+ }
+
+ // The remaining cases are:
+ // Small offset: ld $tmp, %got_disp(symbol)($gp)
+ // >daddiu $tmp, $tmp, offset
+ // >daddu $rd, $tmp, $rs
+ // The instructions marked with a '>' may be omitted if they are redundant. If
+ // this happens then the last instruction must use $rd as the result
+ // register.
+ const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP,
+ Res.getSymA(),
+ getContext());
+ const MCExpr *LoExpr = nullptr;
+ if (Res.getConstant() != 0) {
+ // The %got_disp(symbol) relocation fully resolves the symbol's
+ // address, but we must still account for any constant offset in
+ // expressions like symbol+8.
+ LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
+
+ // FIXME: Offsets greater than 16 bits are not yet implemented.
+ // FIXME: The correct range is a 32-bit sign-extended number.
+ if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) {
+ Error(IDLoc, "macro instruction uses large offset, which is not "
+ "currently supported");
+ return true;
+ }
+ }
+
+ unsigned TmpReg = DstReg;
+ if (UseSrcReg &&
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+ SrcReg)) {
+ // If $rs is the same as $rd, we need to use AT.
+ // If it is not available we exit.
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ TmpReg = ATReg;
+ }
+
+ TOut.emitRRX(Mips::LD, TmpReg, ABI.GetGlobalPtr(),
+ MCOperand::createExpr(GotExpr), IDLoc, STI);
+
+ if (LoExpr)
+ TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+ return false;
+ }
+
const MipsMCExpr *HiExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
const MipsMCExpr *LoExpr =
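
The hunks above extend the la/dla macro to N64 PIC: calls through $25 use %got_call (R_MIPS_CALL16) when the symbol is provably external, and everything else goes through %got_disp plus an optional small addend. A hedged sketch of the expansion order described in the comment, with an invented string-emit format rather than the MipsAsmParser API:

    #include <cstdint>
    #include <string>
    #include <vector>

    std::vector<std::string> expandLoadAddress(const std::string &Sym,
                                               int16_t Offset, bool UseSrcReg) {
      std::vector<std::string> Insns;
      // Load the symbol's address from its GOT entry.
      Insns.push_back("ld     $tmp, %got_disp(" + Sym + ")($gp)");
      if (Offset != 0)   // Fold a small (16-bit) constant addend.
        Insns.push_back("daddiu $tmp, $tmp, " + std::to_string(Offset));
      if (UseSrcReg)     // Add the base register for "dla $rd, sym($rs)".
        Insns.push_back("daddu  $rd, $tmp, $rs");
      return Insns;
    }
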
diff --git a/lib/Target/Mips/MicroMips64r6InstrInfo.td b/lib/Target/Mips/MicroMips64r6InstrInfo.td
index 6b7f39e9dd79..38b09d105ddd 100644
--- a/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -548,3 +548,15 @@ def : MipsInstAlias<"dnegu $rt, $rs",
def : MipsInstAlias<"dnegu $rt",
(DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsll $rd, $rt, $rs",
+ (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt,
+ GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+ (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt,
+ GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsrl $rd, $rt",
+ (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd,
+ GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsll $rd, $rt",
+ (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd,
+ GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6;
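
These aliases lower the register-shift spellings of dsll/dsrl (and their two-operand forms) onto the variable-shift instructions. Their semantics as a plain C++ sketch rather than TableGen: MIPS64 doubleword variable shifts take the amount from the low six bits of $rs.

    #include <cstdint>

    uint64_t dsllv(uint64_t rt, uint32_t rs) { return rt << (rs & 63); }
    uint64_t dsrlv(uint64_t rt, uint32_t rs) { return rt >> (rs & 63); }

    // So "dsll $rd, $rt" assembles as "dsllv $rd, $rd, $rt",
    // i.e. rd = dsllv(rd, rt).
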
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 99025fe1341d..3dba7ce30cad 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -748,9 +748,6 @@ let AdditionalPredicates = [NotInMicroMips] in {
defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>,
GPR_64;
}
-def : MipsInstAlias<"dsll $rd, $rt, $rs",
- (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
- ISA_MIPS3;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"dneg $rt, $rs",
(DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
@@ -793,9 +790,18 @@ def : MipsInstAlias<"dsra $rd, $rt, $rs",
(DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dsll $rd, $rt, $rs",
+ (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+ ISA_MIPS3;
def : MipsInstAlias<"dsrl $rd, $rt, $rs",
(DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
+ def : MipsInstAlias<"dsrl $rd, $rt",
+ (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dsll $rd, $rt",
+ (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>,
+ ISA_MIPS3;
// Two operand (implicit 0 selector) versions:
def : MipsInstAlias<"dmtc0 $rt, $rd",
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 5d82571ff94f..4a34e3101cb8 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -564,7 +564,7 @@ Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
// For given opcode returns opcode of corresponding instruction with short
// delay slot.
-// For the pseudo TAILCALL*_MM instrunctions return the short delay slot
+// For the pseudo TAILCALL*_MM instructions return the short delay slot
// form. Unfortunately, TAILCALL<->b16 is denied as b16 has a limited range
// that is too short to make use of for tail calls.
static int getEquivalentCallShort(int Opcode) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 02102d6b22f4..a6ec9fb2e598 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -364,18 +364,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
- if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) {
- setOperationAction(ISD::ADDC, MVT::i32, Expand);
- setOperationAction(ISD::ADDE, MVT::i32, Expand);
- }
-
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
- setOperationAction(ISD::SUBC, MVT::i32, Expand);
- setOperationAction(ISD::SUBE, MVT::i32, Expand);
- setOperationAction(ISD::SUBC, MVT::i64, Expand);
- setOperationAction(ISD::SUBE, MVT::i64, Expand);
-
// Operations not directly supported by Mips.
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
@@ -481,7 +469,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::AssertZext);
setTargetDAGCombine(ISD::SHL);
@@ -936,130 +923,14 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
}
}
-static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG,
- const MipsSubtarget &Subtarget) {
- // ROOTNode must have a multiplication as an operand for the match to be
- // successful.
- if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL &&
- ROOTNode->getOperand(1).getOpcode() != ISD::MUL)
- return SDValue();
-
- // We don't handle vector types here.
- if (ROOTNode->getValueType(0).isVector())
- return SDValue();
-
- // For MIPS64, madd / msub instructions are inefficent to use with 64 bit
- // arithmetic. E.g.
- // (add (mul a b) c) =>
- // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in
- // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32)
- // or
- // MIPS64R2: (dins (mflo res) (mfhi res) 32 32)
- //
- // The overhead of setting up the Hi/Lo registers and reassembling the
- // result makes this a dubious optimzation for MIPS64. The core of the
- // problem is that Hi/Lo contain the upper and lower 32 bits of the
- // operand and result.
- //
- // It requires a chain of 4 add/mul for MIPS64R2 to get better code
- // density than doing it naively, 5 for MIPS64. Additionally, using
- // madd/msub on MIPS64 requires the operands actually be 32 bit sign
- // extended operands, not true 64 bit values.
- //
- // FIXME: For the moment, disable this completely for MIPS64.
- if (Subtarget.hasMips64())
- return SDValue();
-
- SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
- ? ROOTNode->getOperand(0)
- : ROOTNode->getOperand(1);
-
- SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
- ? ROOTNode->getOperand(1)
- : ROOTNode->getOperand(0);
-
- // Transform this to a MADD only if the user of this node is the add.
- // If there are other users of the mul, this function returns here.
- if (!Mult.hasOneUse())
- return SDValue();
-
- // maddu and madd are unusual instructions in that on MIPS64 bits 63..31
- // must be in canonical form, i.e. sign extended. For MIPS32, the operands
- // of the multiply must have 32 or more sign bits, otherwise we cannot
- // perform this optimization. We have to check this here as we're performing
- // this optimization pre-legalization.
- SDValue MultLHS = Mult->getOperand(0);
- SDValue MultRHS = Mult->getOperand(1);
- unsigned LHSSB = CurDAG.ComputeNumSignBits(MultLHS);
- unsigned RHSSB = CurDAG.ComputeNumSignBits(MultRHS);
-
- if (LHSSB < 32 || RHSSB < 32)
- return SDValue();
-
- APInt HighMask =
- APInt::getHighBitsSet(Mult->getValueType(0).getScalarSizeInBits(), 32);
- bool IsUnsigned = CurDAG.MaskedValueIsZero(Mult->getOperand(0), HighMask) &&
- CurDAG.MaskedValueIsZero(Mult->getOperand(1), HighMask) &&
- CurDAG.MaskedValueIsZero(AddOperand, HighMask);
-
- // Initialize accumulator.
- SDLoc DL(ROOTNode);
- SDValue TopHalf;
- SDValue BottomHalf;
- BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
- CurDAG.getIntPtrConstant(0, DL));
-
- TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
- CurDAG.getIntPtrConstant(1, DL));
- SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
- BottomHalf,
- TopHalf);
-
- // Create MipsMAdd(u) / MipsMSub(u) node.
- bool IsAdd = ROOTNode->getOpcode() == ISD::ADD;
- unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd)
- : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub);
- SDValue MAddOps[3] = {
- CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)),
- CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn};
- EVT VTs[2] = {MVT::i32, MVT::i32};
- SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps);
-
- SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
- SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
- SDValue Combined =
- CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi);
- return Combined;
-}
-
-static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget &Subtarget) {
- // (sub v0 (mul v1, v2)) => (msub v1, v2, v0)
- if (DCI.isBeforeLegalizeOps()) {
- if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
- !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64)
- return performMADD_MSUBCombine(N, DAG, Subtarget);
-
- return SDValue();
- }
-
- return SDValue();
-}
-
static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
- // (add v0 (mul v1, v2)) => (madd v1, v2, v0)
- if (DCI.isBeforeLegalizeOps()) {
- if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
- !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64)
- return performMADD_MSUBCombine(N, DAG, Subtarget);
+ // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+ if (DCI.isBeforeLegalizeOps())
return SDValue();
- }
- // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
SDValue Add = N->getOperand(1);
if (Add.getOpcode() != ISD::ADD)
@@ -1187,8 +1058,6 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return performAssertZextCombine(N, DAG, DCI, Subtarget);
case ISD::SHL:
return performSHLCombine(N, DAG, DCI, Subtarget);
- case ISD::SUB:
- return performSUBCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
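
With the madd/msub matching moved out (see MipsSEISelLowering.cpp below), the surviving ADD combine only reassociates an add so that the jump table's abs_lo part ends up outermost. The rewrite is value-preserving because wrapping integer addition is associative; a toy check, not the DAG transform itself:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t v0 = 0x12345678, v1 = 0x9abc, lo = 0x0def;
      assert(v0 + (v1 + lo) == (v0 + v1) + lo); // associativity holds mod 2^32
      return 0;
    }
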
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 4be26dd25dc0..49ae6dd4cd39 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -245,64 +245,46 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
}
}
-void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const {
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = InFlag.getOpcode();
+void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+ SDValue CmpLHS, const SDLoc &DL,
+ SDNode *Node) const {
+ unsigned Opc = InFlag.getOpcode(); (void)Opc;
+
+ assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+ (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+ "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+ unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu;
+ if (Subtarget->isGP64bit()) {
+ SLTuOp = Mips::SLTu64;
+ ADDuOp = Mips::DADDu;
+ }
+
+ SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
- // In the base case, we can rely on the carry bit from the addsc
- // instruction.
- if (Opc == ISD::ADDC) {
- SDValue Ops[3] = {LHS, RHS, InFlag};
- CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops);
- return;
+ SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops);
+
+ if (Subtarget->isGP64bit()) {
+ // On 64-bit targets, sltu produces an i64 but our backend currently says
+ // that SLTu64 produces an i32. We need to fix this in the long run but for
+ // now, just make the DAG type-correct by asserting the upper bits are zero.
+ Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT,
+ CurDAG->getTargetConstant(0, DL, VT),
+ SDValue(Carry, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL,
+ VT));
}
- assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!");
-
- // The more complex case is when there is a chain of ISD::ADDE nodes like:
- // (adde (adde (adde (addc a b) c) d) e).
- //
- // The addwc instruction does not write to the carry bit, instead it writes
- // to bit 20 of the dsp control register. To match this series of nodes, each
- // intermediate adde node must be expanded to write the carry bit before the
- // addition.
-
- // Start by reading the overflow field for addsc and moving the value to the
- // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP
- // corresponds to reading/writing the entire control register to/from a GPR.
-
- SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32);
-
- SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32);
-
- SDNode *DSPCtrlField =
- CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag);
-
- SDNode *Carry = CurDAG->getMachineNode(
- Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne);
+ // Generate a second addition only if we know that RHS is not a
+ // constant-zero node.
+ SDNode *AddCarry = Carry;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
+ if (!C || C->getZExtValue())
+ AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
- SDValue Ops[4] = {SDValue(DSPCtrlField, 0),
- CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne,
- SDValue(Carry, 0)};
- SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
-
- // My reading of the the MIPS DSP 3.01 specification isn't as clear as I
- // would like about whether bit 20 always gets overwritten by addwc.
- // Hence take an extremely conservative view and presume it's sticky. We
- // therefore need to clear it.
-
- SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32);
-
- SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)};
- SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps);
-
- SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue,
- SDValue(DSPCtrlFinal, 0), CstOne);
-
- SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)};
- CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands);
+ CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
}
/// Match frameindex
@@ -783,8 +765,19 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
switch(Opcode) {
default: break;
+ case ISD::SUBE: {
+ SDValue InFlag = Node->getOperand(2);
+ unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
+ selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
+ return true;
+ }
+
case ISD::ADDE: {
- selectAddE(Node, DL);
+ if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
+ break;
+ SDValue InFlag = Node->getOperand(2);
+ unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
+ selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
return true;
}
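
The carry math behind the new expansion, in plain C++ rather than SelectionDAG: an unsigned add wrapped iff its result compares below one of its operands, which is exactly what sltu computes; subtraction borrows via the same comparison. A sketch of the semantics with invented helper names:

    #include <cstdint>

    uint32_t addeLike(uint32_t hiL, uint32_t hiR,
                      uint32_t loL, uint32_t loR) {
      uint32_t lo    = loL + loR;  // the ADDC half (ADDu)
      uint32_t carry = lo < loR;   // SLTu(result, operand): carry out
      return hiL + (hiR + carry);  // the ADDE half: ADDu + ADDu
    }

    uint32_t subeLike(uint32_t hiL, uint32_t hiR,
                      uint32_t loL, uint32_t loR) {
      uint32_t borrow = loL < loR; // SLTu(lhs, rhs): the SUBC borrow
      return hiL - (hiR + borrow); // the SUBE half: ADDu + SUBu
    }
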
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 6f38289c5a45..f89a350cab04 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -41,7 +41,8 @@ private:
const SDLoc &dl, EVT Ty, bool HasLo,
bool HasHi);
- void selectAddE(SDNode *Node, const SDLoc &DL) const;
+ void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+ const SDLoc &DL, SDNode *Node) const;
bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index b57bceb3c837..06a97b9d123e 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -179,6 +179,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setTargetDAGCombine(ISD::ADDE);
+ setTargetDAGCombine(ISD::SUBE);
setTargetDAGCombine(ISD::MUL);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -419,6 +421,163 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
return MipsTargetLowering::LowerOperation(Op, DAG);
}
+// selectMADD -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+// (addc multLo, Lo0), (adde multHi, Hi0),
+// where,
+// multHi/Lo: product of multiplication
+// Lo0: initial value of Lo register
+// Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
+ // ADDENode's second operand must be a flag output of an ADDC node in order
+ // for the matching to be successful.
+ SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
+
+ if (ADDCNode->getOpcode() != ISD::ADDC)
+ return false;
+
+ SDValue MultHi = ADDENode->getOperand(0);
+ SDValue MultLo = ADDCNode->getOperand(0);
+ SDNode *MultNode = MultHi.getNode();
+ unsigned MultOpc = MultHi.getOpcode();
+
+ // MultHi and MultLo must be generated by the same node,
+ if (MultLo.getNode() != MultNode)
+ return false;
+
+ // and it must be a multiplication.
+ if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+ return false;
+
+ // MultLo and MultHi must be the first and second output of MultNode
+ // respectively.
+ if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+ return false;
+
+ // Transform this to a MADD only if ADDENode and ADDCNode are the only users
+ // of the values of MultNode, in which case MultNode will be removed in later
+ // phases.
+ // If there exist users other than ADDENode or ADDCNode, this function returns
+ // here, which will result in MultNode being mapped to a single MULT
+ // instruction node rather than a pair of MULT and MADD instructions being
+ // produced.
+ if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+ return false;
+
+ SDLoc DL(ADDENode);
+
+ // Initialize accumulator.
+ SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+ ADDCNode->getOperand(1),
+ ADDENode->getOperand(1));
+
+ // create MipsMAdd(u) node
+ MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
+
+ SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
+ MultNode->getOperand(0),// Factor 0
+ MultNode->getOperand(1),// Factor 1
+ ACCIn);
+
+ // replace uses of adde and addc here
+ if (!SDValue(ADDCNode, 0).use_empty()) {
+ SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
+ }
+ if (!SDValue(ADDENode, 0).use_empty()) {
+ SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
+ }
+
+ return true;
+}
+
+// selectMSUB -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+// (addc Lo0, multLo), (sube Hi0, multHi),
+// where,
+// multHi/Lo: product of multiplication
+// Lo0: initial value of Lo register
+// Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
+ // SUBENode's second operand must be a flag output of a SUBC node in order
+ // for the matching to be successful.
+ SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
+
+ if (SUBCNode->getOpcode() != ISD::SUBC)
+ return false;
+
+ SDValue MultHi = SUBENode->getOperand(1);
+ SDValue MultLo = SUBCNode->getOperand(1);
+ SDNode *MultNode = MultHi.getNode();
+ unsigned MultOpc = MultHi.getOpcode();
+
+ // MultHi and MultLo must be generated by the same node,
+ if (MultLo.getNode() != MultNode)
+ return false;
+
+ // and it must be a multiplication.
+ if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+ return false;
+
+ // MultLo and MultHi must be the first and second output of MultNode
+ // respectively.
+ if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+ return false;
+
+ // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
+ // of the values of MultNode, in which case MultNode will be removed in later
+ // phases.
+ // If there exist users other than SUBENode or SUBCNode, this function returns
+ // here, which will result in MultNode being mapped to a single MULT
+ // instruction node rather than a pair of MULT and MSUB instructions being
+ // produced.
+ if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+ return false;
+
+ SDLoc DL(SUBENode);
+
+ // Initialize accumulator.
+ SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+ SUBCNode->getOperand(0),
+ SUBENode->getOperand(0));
+
+ // create MipsSub(u) node
+ MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
+
+ SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
+ MultNode->getOperand(0),// Factor 0
+ MultNode->getOperand(1),// Factor 1
+ ACCIn);
+
+ // replace uses of sube and subc here
+ if (!SDValue(SUBCNode, 0).use_empty()) {
+ SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
+ }
+ if (!SDValue(SUBENode, 0).use_empty()) {
+ SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
+ }
+
+ return true;
+}
+
+static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+ N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
//
// Performs the following transformations:
@@ -661,6 +820,19 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 &&
+ selectMSUB(N, &DAG))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT,
EVT ShiftTy, SelectionDAG &DAG) {
// Clear the upper (64 - VT.sizeInBits) bits.
@@ -938,12 +1110,16 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Val;
switch (N->getOpcode()) {
+ case ISD::ADDE:
+ return performADDECombine(N, DAG, DCI, Subtarget);
case ISD::AND:
Val = performANDCombine(N, DAG, DCI, Subtarget);
break;
case ISD::OR:
Val = performORCombine(N, DAG, DCI, Subtarget);
break;
+ case ISD::SUBE:
+ return performSUBECombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMULCombine(N, DAG, DCI, this);
case ISD::SHL:
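
What the selectMADD/selectMSUB patterns compute, in plain arithmetic: HI:LO acts as a 64-bit accumulator and the instruction folds a full 32x32 product into it. A sketch of the instruction semantics (unsigned variants shown), not of the DAG rewrite:

    #include <cstdint>

    uint64_t maddu(uint64_t acc, uint32_t a, uint32_t b) {
      return acc + (uint64_t)a * b;  // MADDU: accumulate the full product
    }
    uint64_t msubu(uint64_t acc, uint32_t a, uint32_t b) {
      return acc - (uint64_t)a * b;  // MSUBU: subtract the full product
    }
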
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index dd7707084948..a64d95512a4a 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -141,9 +141,9 @@ int NVPTXTTIImpl::getArithmeticInstrCost(
}
}
-void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
+void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
// Enable partial unrolling and runtime unrolling, but reduce the
// threshold. This partially unrolls small loops which are often
diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 03075b550429..f987892ba675 100644
--- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -61,7 +61,8 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
};
} // end namespace llvm
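
The same signature change recurs for PowerPC below: getUnrollingPreferences now receives ScalarEvolution, presumably so unrolling heuristics can query trip-count information. Shape of the updated hook with stand-in types (not LLVM's):

    struct Loop;
    struct ScalarEvolution;
    struct UnrollingPreferences { bool Partial = false, Runtime = false; };

    void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                 UnrollingPreferences &UP) {
      UP.Partial = true;  // enable partial unrolling
      UP.Runtime = true;  // and runtime unrolling
      (void)L; (void)SE;  // a real target would consult SE here
    }
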
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 6d7eb786a683..7393f3d7a08a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -131,10 +131,11 @@ public:
}
}
- void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, bool &IsResolved) override {
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
switch ((PPC::Fixups)Fixup.getKind()) {
- default: break;
+ default:
+ return false;
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
// If the target symbol has a local entry point we must not attempt
@@ -147,10 +148,10 @@ public:
// and thus the shift to pack it.
unsigned Other = S->getOther() << 2;
if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
- IsResolved = false;
+ return true;
}
}
- break;
+ return false;
}
}
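
Background for the forced branch fixups: the ELFv2 ABI encodes a function's local-entry-point offset in the top three bits of st_other (MCSymbolELF stores the field pre-shifted, hence the << 2 above). A nonzero field means the global and local entry points differ, so the linker must see the relocation. A hedged sketch of the raw-field check:

    #include <cstdint>

    bool hasLocalEntry(uint8_t StOther) {
      const uint8_t STO_PPC64_LOCAL_MASK = 0xE0; // bits 5..7 of st_other
      return (StOther & STO_PPC64_LOCAL_MASK) != 0;
    }
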
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index ae43e59d3cb1..dce443997ea5 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -17,35 +17,31 @@
namespace llvm {
namespace PPC {
enum Fixups {
- // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
- // and 'bl'.
+ // 24-bit PC relative relocation for direct branches like 'b' and 'bl'.
fixup_ppc_br24 = FirstTargetFixupKind,
-
- /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
- /// branches.
+
+ /// 14-bit PC relative relocation for conditional branches.
fixup_ppc_brcond14,
-
- /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches
- /// like 'ba' and 'bla'.
+
+ /// 24-bit absolute relocation for direct branches like 'ba' and 'bla'.
fixup_ppc_br24abs,
- /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional
- /// branches.
+ /// 14-bit absolute relocation for conditional branches.
fixup_ppc_brcond14abs,
- /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
- /// or ha16(_foo) for instrs like 'li' or 'addis'.
+ /// A 16-bit fixup corresponding to lo16(_foo) or ha16(_foo) for instrs like
+ /// 'li' or 'addis'.
fixup_ppc_half16,
-
- /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
- /// implied 2 zero bits for instrs like 'std'.
+
+ /// A 14-bit fixup corresponding to lo16(_foo) with implied 2 zero bits for
+ /// instrs like 'std'.
fixup_ppc_half16ds,
- /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
- /// to __tls_get_addr for the TLS general and local dynamic models,
- /// or inserts the thread-pointer register number.
+ /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
+ /// TLS general and local dynamic models, or inserts the thread-pointer
+ /// register number.
fixup_ppc_nofixup,
-
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 6d591ca964a6..d5506277ca88 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -219,11 +219,11 @@ bool PPCMachObjectWriter::recordScatteredRelocation(
const MCSymbol *SB = &B->getSymbol();
if (!SB->getFragment())
- report_fatal_error("symbol '" + B->getSymbol().getName() +
+ report_fatal_error("symbol '" + SB->getName() +
"' can not be undefined in a subtraction expression");
// FIXME: is Type correct? see include/llvm/BinaryFormat/MachO.h
- Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ Value2 = Writer->getSymbolAddress(*SB, Layout);
FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
}
// FIXME: does FixedValue get used??
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 07c9c1f9f84c..ad92ac8ce120 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H
#define LLVM_LIB_TARGET_POWERPC_PPC_H
+#include "llvm/Support/CodeGen.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
// GCC #defines PPC on Linux but we use it as our namespace name
@@ -28,7 +29,7 @@ namespace llvm {
class AsmPrinter;
class MCInst;
- FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+ FunctionPass *createPPCCTRLoops();
#ifndef NDEBUG
FunctionPass *createPPCCTRLoopsVerify();
#endif
@@ -41,7 +42,7 @@ namespace llvm {
FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCQPXLoadSplatPass();
- FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+ FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL);
FunctionPass *createPPCTLSDynamicCallPass();
FunctionPass *createPPCBoolRetToIntPass();
FunctionPass *createPPCExpandISELPass();
@@ -51,6 +52,7 @@ namespace llvm {
void initializePPCVSXFMAMutatePass(PassRegistry&);
void initializePPCBoolRetToIntPass(PassRegistry&);
void initializePPCExpandISELPass(PassRegistry &);
+ void initializePPCTLSDynamicCallPass(PassRegistry &);
extern char &PPCVSXFMAMutateID;
namespace PPCII {
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 24bc027f8106..094d3e6a61b5 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -24,12 +24,14 @@
//===----------------------------------------------------------------------===//
#include "PPC.h"
+#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
@@ -81,10 +83,7 @@ namespace {
public:
static char ID;
- PPCCTRLoops() : FunctionPass(ID), TM(nullptr) {
- initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
- }
- PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+ PPCCTRLoops() : FunctionPass(ID) {
initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
}
@@ -99,16 +98,18 @@ namespace {
}
private:
- bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+ bool mightUseCTR(BasicBlock *BB);
bool convertToCTRLoop(Loop *L);
private:
- PPCTargetMachine *TM;
+ const PPCTargetMachine *TM;
+ const PPCSubtarget *STI;
+ const PPCTargetLowering *TLI;
+ const DataLayout *DL;
+ const TargetLibraryInfo *LibInfo;
LoopInfo *LI;
ScalarEvolution *SE;
- const DataLayout *DL;
DominatorTree *DT;
- const TargetLibraryInfo *LibInfo;
bool PreserveLCSSA;
};
@@ -149,9 +150,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
-FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
- return new PPCCTRLoops(TM);
-}
+FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
#ifndef NDEBUG
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
@@ -169,6 +168,14 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ TM = &TPC->getTM<PPCTargetMachine>();
+ STI = TM->getSubtargetImpl(F);
+ TLI = STI->getTargetLowering();
+
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -198,8 +205,7 @@ static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
// Determining the address of a TLS variable results in a function call in
// certain TLS models.
-static bool memAddrUsesCTR(const PPCTargetMachine *TM,
- const Value *MemAddr) {
+static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
const auto *GV = dyn_cast<GlobalValue>(MemAddr);
if (!GV) {
// Recurse to check for constants that refer to TLS global variables.
@@ -213,35 +219,35 @@ static bool memAddrUsesCTR(const PPCTargetMachine *TM,
if (!GV->isThreadLocal())
return false;
- if (!TM)
- return true;
- TLSModel::Model Model = TM->getTLSModel(GV);
+ TLSModel::Model Model = TM.getTLSModel(GV);
return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
}
-bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+// Loop through the inline asm constraints and look for something that clobbers
+// ctr.
+static bool asmClobbersCTR(InlineAsm *IA) {
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+ for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+ InlineAsm::ConstraintInfo &C = CIV[i];
+ if (C.Type != InlineAsm::isInput)
+ for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+ if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ return true;
+ }
+ return false;
+}
+
+bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
J != JE; ++J) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
+ // Inline ASM is okay, unless it clobbers the ctr register.
if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
- // Inline ASM is okay, unless it clobbers the ctr register.
- InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
- for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
- InlineAsm::ConstraintInfo &C = CIV[i];
- if (C.Type != InlineAsm::isInput)
- for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
- if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
- return true;
- }
-
+ if (asmClobbersCTR(IA))
+ return true;
continue;
}
- if (!TM)
- return true;
- const TargetLowering *TLI =
- TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
-
if (Function *F = CI->getCalledFunction()) {
// Most intrinsics don't become function calls, but some might.
// sin, cos, exp and log are always calls.
@@ -380,9 +386,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
}
if (Opcode) {
- auto &DL = CI->getModule()->getDataLayout();
- MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(),
- true);
+ MVT VTy = TLI->getSimpleValueType(
+ *DL, CI->getArgOperand(0)->getType(), true);
if (VTy == MVT::Other)
return true;
@@ -406,17 +411,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
CastInst *CI = cast<CastInst>(J);
if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
- isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) ||
- isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType()))
+ isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
+ isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
return true;
- } else if (isLargeIntegerTy(TT.isArch32Bit(),
+ } else if (isLargeIntegerTy(!TM->isPPC64(),
J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::UDiv ||
J->getOpcode() == Instruction::SDiv ||
J->getOpcode() == Instruction::URem ||
J->getOpcode() == Instruction::SRem)) {
return true;
- } else if (TT.isArch32Bit() &&
+ } else if (!TM->isPPC64() &&
isLargeIntegerTy(false, J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::Shl ||
J->getOpcode() == Instruction::AShr ||
@@ -428,16 +433,11 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
// On PowerPC, indirect jumps use the counter register.
return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
- if (!TM)
- return true;
- const TargetLowering *TLI =
- TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
-
if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
return true;
}
- if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) {
+ if (STI->useSoftFloat()) {
switch(J->getOpcode()) {
case Instruction::FAdd:
case Instruction::FSub:
@@ -456,7 +456,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
}
for (Value *Operand : J->operands())
- if (memAddrUsesCTR(TM, Operand))
+ if (memAddrUsesCTR(*TM, Operand))
return true;
}
@@ -466,11 +466,6 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
bool MadeChange = false;
- const Triple TT =
- Triple(L->getHeader()->getParent()->getParent()->getTargetTriple());
- if (!TT.isArch32Bit() && !TT.isArch64Bit())
- return MadeChange; // Unknown arch. type.
-
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
@@ -495,7 +490,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// want to use the counter register if the loop contains calls.
for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
I != IE; ++I)
- if (mightUseCTR(TT, *I))
+ if (mightUseCTR(*I))
return MadeChange;
SmallVector<BasicBlock*, 4> ExitingBlocks;
@@ -517,7 +512,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
} else if (!SE->isLoopInvariant(EC, L))
continue;
- if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+ if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
continue;
// We now have a loop-invariant count of loop iterations (which is not the
@@ -571,7 +566,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// preheader, then we can use it (except if the preheader contains a use of
// the CTR register because some such uses might be reordered by the
// selection DAG after the mtctr instruction).
- if (!Preheader || mightUseCTR(TT, Preheader))
+ if (!Preheader || mightUseCTR(Preheader))
Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
if (!Preheader)
return MadeChange;
@@ -582,10 +577,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// selected branch.
MadeChange = true;
- SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt");
+ SCEVExpander SCEVE(*SE, *DL, "loopcnt");
LLVMContext &C = SE->getContext();
- Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
- Type::getInt32Ty(C);
+ Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
if (!ExitCount->getType()->isPointerTy() &&
ExitCount->getType() != CountType)
ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
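
The constraint scan factored out as asmClobbersCTR is self-contained enough to sketch on its own: any constraint that is not a plain input and names {ctr} counts as a clobber. Shapes below are illustrative, not the InlineAsm API:

    #include <string>
    #include <vector>

    struct Constraint { bool IsInput; std::vector<std::string> Codes; };

    bool clobbersCTR(const std::vector<Constraint> &CIV) {
      for (const Constraint &C : CIV)
        if (!C.IsInput)                            // outputs and clobbers only
          for (const std::string &Code : C.Codes)
            if (Code == "{ctr}" || Code == "{CTR}") // upstream compares
              return true;                          // case-insensitively
      return false;
    }
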
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index afd2e87078a9..535b9deaefac 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -114,8 +114,8 @@ namespace {
unsigned GlobalBaseReg;
public:
- explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
- : SelectionDAGISel(tm), TM(tm) {}
+ explicit PPCDAGToDAGISel(PPCTargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), TM(tm) {}
bool runOnMachineFunction(MachineFunction &MF) override {
// Make sure we re-emit a set of the global base reg if necessary
@@ -5116,6 +5116,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
/// createPPCISelDag - This pass converts a legalized DAG into a
/// PowerPC-specific DAG, ready for instruction scheduling.
///
-FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
- return new PPCDAGToDAGISel(TM);
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new PPCDAGToDAGISel(TM, OptLevel);
}
diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 31c50785c2ee..5f8085f4626e 100644
--- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -52,6 +52,7 @@ namespace {
protected:
bool processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
+ bool NeedFence = true;
bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
@@ -62,6 +63,16 @@ protected:
MI.getOpcode() != PPC::ADDItlsldLADDR &&
MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
MI.getOpcode() != PPC::ADDItlsldLADDR32) {
+
+ // Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP
+ // as scheduling fences, we skip creating our own fences if we are
+ // already inside an existing ADJCALLSTACKDOWN/UP pair, since
+ // nesting them causes a verification error with -verify-machineinstrs.
+ if (MI.getOpcode() == PPC::ADJCALLSTACKDOWN)
+ NeedFence = false;
+ else if (MI.getOpcode() == PPC::ADJCALLSTACKUP)
+ NeedFence = true;
+
++I;
continue;
}
@@ -96,11 +107,15 @@ protected:
break;
}
- // Don't really need to save data to the stack - the clobbered
+ // We create ADJCALLSTACKDOWN and ADJCALLSTACKUP around _tls_get_addr
+ // as scheduling fences to prevent it from being scheduled before
+ // mflr in the prologue, which would clobber the address in LR (PR25839).
+ // We don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
// gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
- .addImm(0);
+ if (NeedFence)
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
+ .addImm(0);
// Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
@@ -116,7 +131,8 @@ protected:
.addReg(GPR3));
Call->addOperand(MI.getOperand(3));
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
+ if (NeedFence)
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg)
.addReg(GPR3);
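
The NeedFence bookkeeping above amounts to a small state machine: emit our own ADJCALLSTACKDOWN/UP pair around the TLS pseudo only when not already inside an existing pair. A simplified sketch over plain opcodes rather than MachineInstrs:

    #include <vector>

    enum Op { ADJDOWN, ADJUP, TLSADDR, OTHER };

    int fencesToEmit(const std::vector<Op> &Block) {
      bool NeedFence = true;
      int Fences = 0;
      for (Op O : Block) {
        if (O == ADJDOWN)
          NeedFence = false;          // entered an existing pair
        else if (O == ADJUP)
          NeedFence = true;           // the existing pair closed
        else if (O == TLSADDR && NeedFence)
          ++Fences;                   // would wrap this call in DOWN/UP
      }
      return Fences;
    }
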
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index a88a6541e8d0..fe092cc3b858 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -93,6 +93,7 @@ extern "C" void LLVMInitializePowerPCTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCBoolRetToIntPass(PR);
initializePPCExpandISELPass(PR);
+ initializePPCTLSDynamicCallPass(PR);
}
/// Return the datalayout string of a subtarget.
@@ -336,7 +337,7 @@ bool PPCPassConfig::addPreISel() {
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createPPCCTRLoops(getPPCTargetMachine()));
+ addPass(createPPCCTRLoops());
return false;
}
@@ -352,7 +353,7 @@ bool PPCPassConfig::addILPOpts() {
bool PPCPassConfig::addInstSelector() {
// Install an instruction selector.
- addPass(createPPCISelDag(getPPCTargetMachine()));
+ addPass(createPPCISelDag(getPPCTargetMachine(), getOptLevel()));
#ifndef NDEBUG
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 5eb6ba785d1b..2dc3828334ac 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -41,6 +41,7 @@ public:
~PPCTargetMachine() override;
const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
+ const PPCSubtarget *getSubtargetImpl() const = delete;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 3dbd5f5b9a92..6110706b01b9 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -189,7 +189,7 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return PPCTTIImpl::getIntImmCost(Imm, Ty);
}
-void PPCTTIImpl::getUnrollingPreferences(Loop *L,
+void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {
// The A2 is in-order with a deep pipeline, and concatenation unrolling
@@ -201,7 +201,7 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L,
UP.AllowExpensiveTripCount = true;
}
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
}
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 758c335def08..99ca6394d1be 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -52,7 +52,8 @@ public:
Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
/// @}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index c72b47b09085..d4454c271f5a 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -203,13 +203,14 @@ namespace {
return InfosBE[Kind - FirstTargetFixupKind];
}
- void processFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target, bool &IsResolved) override {
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
switch ((Sparc::Fixups)Fixup.getKind()) {
- default: break;
+ default:
+ return false;
case Sparc::fixup_sparc_wplt30:
if (Target.getSymA()->getSymbol().isTemporary())
- return;
+ return false;
case Sparc::fixup_sparc_tls_gd_hi22:
case Sparc::fixup_sparc_tls_gd_lo10:
case Sparc::fixup_sparc_tls_gd_add:
@@ -227,7 +228,8 @@ namespace {
case Sparc::fixup_sparc_tls_ie_ldx:
case Sparc::fixup_sparc_tls_ie_add:
case Sparc::fixup_sparc_tls_le_hix22:
- case Sparc::fixup_sparc_tls_le_lox10: IsResolved = false; break;
+ case Sparc::fixup_sparc_tls_le_lox10:
+ return true;
}
}
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index ad05779a9f64..ee23692ad1db 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -61,6 +61,7 @@ enum RegisterKind {
VR64Reg,
VR128Reg,
AR32Reg,
+ CR64Reg,
};
enum MemoryKind {
@@ -343,6 +344,7 @@ public:
bool isVF128() const { return false; }
bool isVR128() const { return isReg(VR128Reg); }
bool isAR32() const { return isReg(AR32Reg); }
+ bool isCR64() const { return isReg(CR64Reg); }
bool isAnyReg() const { return (isReg() || isImm(0, 15)); }
bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); }
bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); }
@@ -379,7 +381,8 @@ private:
RegGR,
RegFP,
RegV,
- RegAR
+ RegAR,
+ RegCR
};
struct Register {
RegisterGroup Group;
@@ -487,6 +490,9 @@ public:
OperandMatchResultTy parseAR32(OperandVector &Operands) {
return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg);
}
+ OperandMatchResultTy parseCR64(OperandVector &Operands) {
+ return parseRegister(Operands, RegCR, SystemZMC::CR64Regs, CR64Reg);
+ }
OperandMatchResultTy parseAnyReg(OperandVector &Operands) {
return parseAnyRegister(Operands);
}
@@ -648,6 +654,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg) {
Reg.Group = RegV;
else if (Prefix == 'a' && Reg.Num < 16)
Reg.Group = RegAR;
+ else if (Prefix == 'c' && Reg.Num < 16)
+ Reg.Group = RegCR;
else
return Error(Reg.StartLoc, "invalid register");
@@ -741,6 +749,10 @@ SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
Kind = AR32Reg;
RegNo = SystemZMC::AR32Regs[Reg.Num];
}
+ else if (Reg.Group == RegCR) {
+ Kind = CR64Reg;
+ RegNo = SystemZMC::CR64Regs[Reg.Num];
+ }
else {
return MatchOperand_ParseFail;
}
@@ -1056,6 +1068,8 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
RegNo = SystemZMC::VR128Regs[Reg.Num];
else if (Reg.Group == RegAR)
RegNo = SystemZMC::AR32Regs[Reg.Num];
+ else if (Reg.Group == RegCR)
+ RegNo = SystemZMC::CR64Regs[Reg.Num];
StartLoc = Reg.StartLoc;
EndLoc = Reg.EndLoc;
return false;
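
The parser changes above hang control registers off the existing single-letter prefix dispatch. A condensed sketch of the classification (simplified from parseRegister; ranges follow the checks visible in the hunks):

    enum RegisterGroup { RegGR, RegFP, RegV, RegAR, RegCR, RegInvalid };

    RegisterGroup classify(char Prefix, unsigned Num) {
      switch (Prefix) {
      case 'r': return Num < 16 ? RegGR : RegInvalid; // %r0-%r15
      case 'f': return Num < 16 ? RegFP : RegInvalid; // %f0-%f15
      case 'v': return Num < 32 ? RegV  : RegInvalid; // %v0-%v31
      case 'a': return Num < 16 ? RegAR : RegInvalid; // %a0-%a15
      case 'c': return Num < 16 ? RegCR : RegInvalid; // new: %c0-%c15
      default:  return RegInvalid;
      }
    }
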
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 27fd70bc6092..8903b57ffd0b 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -162,6 +162,12 @@ static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16);
}
+static DecodeStatus DecodeCR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::CR64Regs, 16);
+}
+
template<unsigned N>
static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) {
if (!isUInt<N>(Imm))
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index dfea7e33fa15..727ab921daf9 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -116,6 +116,13 @@ const unsigned SystemZMC::AR32Regs[16] = {
SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15
};
+const unsigned SystemZMC::CR64Regs[16] = {
+ SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3,
+ SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7,
+ SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11,
+ SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15
+};
+
unsigned SystemZMC::getFirstReg(unsigned Reg) {
static unsigned Map[SystemZ::NUM_TARGET_REGS];
static bool Initialized = false;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index d9926c7e4986..dbca3485290a 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -55,6 +55,7 @@ extern const unsigned VR32Regs[32];
extern const unsigned VR64Regs[32];
extern const unsigned VR128Regs[32];
extern const unsigned AR32Regs[16];
+extern const unsigned CR64Regs[16];
// Return the 0-based number of the first architectural register that
// contains the given LLVM register. E.g. R1D -> 1.
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index 74cf653b9d95..9b714157550d 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -67,6 +67,11 @@ We don't use ICM, STCM, or CLM.
--
+We don't use ADD (LOGICAL) HIGH, SUBTRACT (LOGICAL) HIGH,
+or COMPARE (LOGICAL) HIGH yet.
+
+--
+
DAGCombiner doesn't yet fold truncations of extended loads. Functions like:
unsigned long f (unsigned long x, unsigned short *y)
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index c5f324418da5..41300a1b6295 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -56,6 +56,7 @@ include "SystemZInstrVector.td"
include "SystemZInstrFP.td"
include "SystemZInstrHFP.td"
include "SystemZInstrDFP.td"
+include "SystemZInstrSystem.td"
def SystemZInstrInfo : InstrInfo {}
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index ffb0b8d1c861..c5faa0d62881 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -68,11 +68,21 @@ def FeaturePopulationCount : SystemZFeature<
"Assume that the population-count facility is installed"
>;
+def FeatureMessageSecurityAssist3 : SystemZFeature<
+ "message-security-assist-extension3", "MessageSecurityAssist3",
+ "Assume that the message-security-assist extension facility 3 is installed"
+>;
+
def FeatureMessageSecurityAssist4 : SystemZFeature<
"message-security-assist-extension4", "MessageSecurityAssist4",
"Assume that the message-security-assist extension facility 4 is installed"
>;
+def FeatureResetReferenceBitsMultiple : SystemZFeature<
+ "reset-reference-bits-multiple", "ResetReferenceBitsMultiple",
+ "Assume that the reset-reference-bits-multiple facility is installed"
+>;
+
def Arch9NewFeatures : SystemZFeatureList<[
FeatureDistinctOps,
FeatureFastSerialization,
@@ -81,7 +91,9 @@ def Arch9NewFeatures : SystemZFeatureList<[
FeatureInterlockedAccess1,
FeatureLoadStoreOnCond,
FeaturePopulationCount,
- FeatureMessageSecurityAssist4
+ FeatureMessageSecurityAssist3,
+ FeatureMessageSecurityAssist4,
+ FeatureResetReferenceBitsMultiple
]>;
//===----------------------------------------------------------------------===//
@@ -120,13 +132,19 @@ def FeatureDFPZonedConversion : SystemZFeature<
"Assume that the DFP zoned-conversion facility is installed"
>;
+def FeatureEnhancedDAT2 : SystemZFeature<
+ "enhanced-dat-2", "EnhancedDAT2",
+ "Assume that the enhanced-DAT facility 2 is installed"
+>;
+
def Arch10NewFeatures : SystemZFeatureList<[
FeatureExecutionHint,
FeatureLoadAndTrap,
FeatureMiscellaneousExtensions,
FeatureProcessorAssist,
FeatureTransactionalExecution,
- FeatureDFPZonedConversion
+ FeatureDFPZonedConversion,
+ FeatureEnhancedDAT2
]>;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 5f6115ed86a4..7620e06ccbc9 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2468,6 +2468,14 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpType = "reg";
}
+class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src),
+ mnemonic#"\t$R1", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let R2 = 0;
+}
+
class UnaryMemRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2>
: InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
@@ -2702,6 +2710,26 @@ class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode,
let AddedComplexity = 7;
}
+class SideEffectBinaryRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []>;
+
+class SideEffectBinaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []> {
+ let R3 = 0;
+ let M4 = 0;
+}
+
+class SideEffectBinaryRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []> {
+ let M3 = 0;
+}
+
class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
Immediate imm1, Immediate imm2>
: InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
@@ -2729,6 +2757,10 @@ class SideEffectBinarySSf<string mnemonic, bits<8> opcode>
: InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2),
mnemonic##"\t$BD1, $BDL2", []>;
+class SideEffectBinarySSE<string mnemonic, bits<16> opcode>
+ : InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2),
+ mnemonic#"\t$BD1, $BD2", []>;
+
class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
@@ -3612,6 +3644,22 @@ class SideEffectTernarySSc<string mnemonic, bits<8> opcode>
shift12only:$BD2, imm32zx4:$I3),
mnemonic##"\t$BDL1, $BD2, $I3", []>;
+class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3),
+ mnemonic#"\t$R1, $R2, $R3", []> {
+ let M4 = 0;
+}
+
+class SideEffectTernaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3),
+ mnemonic#"\t$R1, $R3, $R2", []> {
+ let M4 = 0;
+}
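+// Note the RRF-b assembly operand order: $R1, $R3, $R2.  The Opt variants
+// of IDTE and CRDTE in SystemZInstrSystem.td use SideEffectTernaryRRFb.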
+
class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
RegisterOperand cls1,
RegisterOperand cls2,
@@ -3630,6 +3678,13 @@ class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
: InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []>;
+multiclass SideEffectTernaryRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2> {
+ def "" : SideEffectTernaryRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : SideEffectBinaryRRFc<mnemonic, opcode, cls1, cls2>;
+}
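+// The Opt variant accepts the assembler form with the optional M3 operand
+// omitted (encoded as zero).  SSKE in SystemZInstrSystem.td is defined via
+// this multiclass:
+//   defm SSKE : SideEffectTernaryRRFcOpt<"sske", 0xB22B, GR32, GR64>;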
+
class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
Immediate imm>
@@ -3720,6 +3775,18 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
}
}
+class SideEffectTernaryRS<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSa<opcode, (outs),
+ (ins cls1:$R1, cls2:$R3, bdaddr12only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []>;
+
+class SideEffectTernaryRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSYa<opcode, (outs),
+ (ins cls1:$R1, cls2:$R3, bdaddr20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []>;
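+// DIAG, TRACE and SIGP in SystemZInstrSystem.td use SideEffectTernaryRS;
+// TRACG uses the long-displacement RSY form.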
+
class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
RegisterOperand cls1, RegisterOperand cls2>
: InstRSa<opcode, (outs cls1:$R1, cls2:$R3),
@@ -3997,6 +4064,35 @@ multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> {
VR128:$V4, imm32zx4:$M5, 0)>;
}
+class SideEffectQuaternaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R2, $R3, $M4", []>;
+
+multiclass SideEffectQuaternaryRRFaOptOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3> {
+ def "" : SideEffectQuaternaryRRFa<mnemonic, opcode, cls1, cls2, cls3>;
+ def Opt : SideEffectTernaryRRFa<mnemonic, opcode, cls1, cls2, cls3>;
+ def OptOpt : SideEffectBinaryRRFa<mnemonic, opcode, cls1, cls2>;
+}
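+// This provides three assembler forms so that one or both trailing operands
+// can be omitted, with the omitted R3/M4 fields encoded as zero.  IPTE in
+// SystemZInstrSystem.td uses it, yielding IPTE, IPTEOpt and IPTEOptOpt.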
+
+class SideEffectQuaternaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R3, $R2, $M4", []>;
+
+multiclass SideEffectQuaternaryRRFbOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3> {
+ def "" : SideEffectQuaternaryRRFb<mnemonic, opcode, cls1, cls2, cls3>;
+ def Opt : SideEffectTernaryRRFb<mnemonic, opcode, cls1, cls2, cls3>;
+}
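+// IDTE and CRDTE in SystemZInstrSystem.td are defined via this multiclass;
+// the Opt variant omits the M4 operand (encoded as zero).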
+
class SideEffectQuaternarySSe<string mnemonic, bits<8> opcode,
RegisterOperand cls>
: InstSSe<opcode, (outs),
@@ -4012,6 +4108,16 @@ class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let mayStore = 1;
}
+class CmpSwapRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
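+// CmpSwapRRE ties $R1 as both input and output and marks the storage access
+// with mayLoad/mayStore.  CSP and CSPG in SystemZInstrSystem.td use it.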
+
class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls, AddressingMode mode = bdaddr12only>
: InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 9f5e6288348e..98f66c29ae64 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -883,6 +883,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
}
def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
+ // Addition to a high register.
+ def AHHHR : BinaryRRFa<"ahhhr", 0xB9C8, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def AHHLR : BinaryRRFa<"ahhlr", 0xB9D8, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Addition of signed 16-bit immediates.
defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>;
@@ -917,6 +923,12 @@ let Defs = [CC] in {
}
def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
+ // Addition to a high register.
+ def ALHHHR : BinaryRRFa<"alhhhr", 0xB9CA, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def ALHHLR : BinaryRRFa<"alhhlr", 0xB9DA, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Addition of signed 16-bit immediates.
def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>,
Requires<[FeatureDistinctOps]>;
@@ -927,6 +939,10 @@ let Defs = [CC] in {
def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>;
def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+ // Addition of signed 32-bit immediates.
+ def ALSIH : BinaryRIL<"alsih", 0xCCA, null_frag, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+
// Addition of memory.
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
@@ -949,6 +965,10 @@ let Defs = [CC], Uses = [CC] in {
def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
}
+// Addition that does not modify the condition code.
+def ALSIHN : BinaryRIL<"alsihn", 0xCCB, null_frag, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
@@ -961,6 +981,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>;
+ // Subtraction from a high register.
+ def SHHHR : BinaryRRFa<"shhhr", 0xB9C9, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def SHHLR : BinaryRRFa<"shhlr", 0xB9D9, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Subtraction of memory.
defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
@@ -976,6 +1002,12 @@ let Defs = [CC] in {
def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+ // Subtraction from a high register.
+ def SLHHHR : BinaryRRFa<"slhhhr", 0xB9CB, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def SLHHLR : BinaryRRFa<"slhhlr", 0xB9DB, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Subtraction of unsigned 32-bit immediates. These don't match
// subc because we prefer addc for constants.
def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>;
@@ -1298,6 +1330,12 @@ let Defs = [CC], CCValues = 0xE in {
def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>;
def CGR : CompareRRE<"cgr", 0xB920, z_scmp, GR64, GR64>;
+ // Comparison with a high register.
+ def CHHR : CompareRRE<"chhr", 0xB9CD, null_frag, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def CHLR : CompareRRE<"chlr", 0xB9DD, null_frag, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Comparison with a signed 16-bit immediate. CHIMux expands to CHI or CIH,
// depending on the choice of register.
def CHIMux : CompareRIPseudo<z_scmp, GRX32, imm32sx16>,
@@ -1344,6 +1382,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>;
def CLGR : CompareRRE<"clgr", 0xB921, z_ucmp, GR64, GR64>;
+ // Comparison with a high register.
+ def CLHHR : CompareRRE<"clhhr", 0xB9CF, null_frag, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def CLHLR : CompareRRE<"clhlr", 0xB9DF, null_frag, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Comparison with an unsigned 32-bit immediate. CLFIMux expands to CLFI
// or CLIH, depending on the choice of register.
def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
@@ -1888,54 +1932,12 @@ let mayLoad = 1, Defs = [CC] in
let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in
def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>;
-// Supervisor call.
-let hasSideEffects = 1, isCall = 1, Defs = [CC] in
- def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
-
-// Monitor call.
-let hasSideEffects = 1, isCall = 1 in
- def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>;
-
-// Store clock.
-let hasSideEffects = 1, Defs = [CC] in {
- def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
- def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>;
- def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>;
-}
-
-// Store facility list.
-let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
- def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
-
-// Extract CPU attribute.
-let hasSideEffects = 1 in
- def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>;
-
-// Extract CPU time.
-let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
- def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
-
-// Extract PSW.
-let hasSideEffects = 1, Uses = [CC] in
- def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
-
// Execute.
let hasSideEffects = 1 in {
def EX : SideEffectBinaryRX<"ex", 0x44, GR64>;
def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
}
-// Program return.
-let hasSideEffects = 1, Defs = [CC] in
- def PR : SideEffectInherentE<"pr", 0x0101>;
-
-// Move with key.
-let mayLoad = 1, mayStore = 1, Defs = [CC] in
- def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>;
-
-// Store real address.
-def STRAG : StoreSSE<"strag", 0xE502>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td
new file mode 100644
index 000000000000..a9803c2d83e9
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrSystem.td
@@ -0,0 +1,517 @@
+//==- SystemZInstrSystem.td - SystemZ system instructions -*- tblgen-*-----==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SystemZ system-level instructions. Most of these
+// instructions are privileged or semi-privileged. They are not used for
+// code generation, but are provided for use with the assembler and
+// disassembler only.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Program-Status Word Instructions.
+//===----------------------------------------------------------------------===//
+
+// Extract PSW.
+let hasSideEffects = 1, Uses = [CC] in
+ def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
+
+// Load PSW (extended).
+let hasSideEffects = 1, Defs = [CC], mayLoad = 1 in {
+ def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>;
+ def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>;
+}
+
+// Insert PSW key.
+let Uses = [R2L], Defs = [R2L] in
+ def IPK : SideEffectInherentS<"ipk", 0xB20B, null_frag>;
+
+// Set PSW key from address.
+let hasSideEffects = 1 in
+ def SPKA : SideEffectAddressS<"spka", 0xB20A, null_frag>;
+
+// Set system mask.
+let hasSideEffects = 1, mayLoad = 1 in
+ def SSM : SideEffectUnaryS<"ssm", 0x8000, null_frag, 1>;
+
+// Store then AND/OR system mask.
+let hasSideEffects = 1 in {
+ def STNSM : StoreSI<"stnsm", 0xAC, null_frag, imm32zx8>;
+ def STOSM : StoreSI<"stosm", 0xAD, null_frag, imm32zx8>;
+}
+
+// Insert address space control.
+let hasSideEffects = 1 in
+ def IAC : InherentRRE<"iac", 0xB224, GR32, null_frag>;
+
+// Set address space control (fast).
+let hasSideEffects = 1 in {
+ def SAC : SideEffectAddressS<"sac", 0xB219, null_frag>;
+ def SACF : SideEffectAddressS<"sacf", 0xB279, null_frag>;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Register Instructions.
+//===----------------------------------------------------------------------===//
+
+// Load control.
+def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>;
+def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>;
+
+// Store control.
+def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>;
+def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>;
+
+// Extract primary ASN (and instance).
+let hasSideEffects = 1 in {
+ def EPAR : InherentRRE<"epar", 0xB226, GR32, null_frag>;
+ def EPAIR : InherentRRE<"epair", 0xB99A, GR64, null_frag>;
+}
+
+// Extract secondary ASN (and instance).
+let hasSideEffects = 1 in {
+ def ESAR : InherentRRE<"esar", 0xB227, GR32, null_frag>;
+ def ESAIR : InherentRRE<"esair", 0xB99B, GR64, null_frag>;
+}
+
+// Set secondary ASN (and instance).
+let hasSideEffects = 1 in {
+ def SSAR : SideEffectUnaryRRE<"ssar", 0xB225, GR32, null_frag>;
+ def SSAIR : SideEffectUnaryRRE<"ssair", 0xB99F, GR64, null_frag>;
+}
+
+// Extract and set extended authority.
+let hasSideEffects = 1 in
+ def ESEA : UnaryTiedRRE<"esea", 0xB99D, GR32>;
+
+//===----------------------------------------------------------------------===//
+// Prefix-Register Instructions.
+//===----------------------------------------------------------------------===//
+
+// Set prefix.
+let hasSideEffects = 1 in
+ def SPX : SideEffectUnaryS<"spx", 0xB210, null_frag, 4>;
+
+// Store prefix.
+let hasSideEffects = 1 in
+ def STPX : StoreInherentS<"stpx", 0xB211, null_frag, 4>;
+
+//===----------------------------------------------------------------------===//
+// Storage-Key and Real Memory Instructions.
+//===----------------------------------------------------------------------===//
+
+// Insert storage key extended.
+let hasSideEffects = 1 in
+ def ISKE : BinaryRRE<"iske", 0xB229, null_frag, GR32, GR64>;
+
+// Insert virtual storage key.
+let hasSideEffects = 1 in
+ def IVSK : BinaryRRE<"ivsk", 0xB223, null_frag, GR32, GR64>;
+
+// Set storage key extended.
+let hasSideEffects = 1, Defs = [CC] in
+ defm SSKE : SideEffectTernaryRRFcOpt<"sske", 0xB22B, GR32, GR64>;
+
+// Reset reference bit extended.
+let hasSideEffects = 1, Defs = [CC] in
+ def RRBE : SideEffectBinaryRRE<"rrbe", 0xB22A, GR32, GR64>;
+
+// Reset reference bits multiple.
+let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in
+ def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>;
+
+// Perform frame management function.
+let hasSideEffects = 1 in
+ def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>;
+
+// Test block.
+let hasSideEffects = 1, mayStore = 1, Uses = [R0D], Defs = [R0D, CC] in
+ def TB : SideEffectBinaryRRE<"tb", 0xB22C, GR64, GR64>;
+
+// Page in / out.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def PGIN : SideEffectBinaryRRE<"pgin", 0xB22E, GR64, GR64>;
+ def PGOUT : SideEffectBinaryRRE<"pgout", 0xB22F, GR64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Dynamic-Address-Translation Instructions.
+//===----------------------------------------------------------------------===//
+
+// Invalidate page table entry.
+let hasSideEffects = 1 in
+ defm IPTE : SideEffectQuaternaryRRFaOptOpt<"ipte", 0xB221, GR64, GR32, GR32>;
+
+// Invalidate DAT table entry.
+let hasSideEffects = 1 in
+ defm IDTE : SideEffectQuaternaryRRFbOpt<"idte", 0xB98E, GR64, GR64, GR64>;
+
+// Compare and replace DAT table entry.
+let Predicates = [FeatureEnhancedDAT2], hasSideEffects = 1, Defs = [CC] in
+ defm CRDTE : SideEffectQuaternaryRRFbOpt<"crdte", 0xB98F, GR128, GR128, GR64>;
+
+// Purge TLB.
+let hasSideEffects = 1 in
+ def PTLB : SideEffectInherentS<"ptlb", 0xB20D, null_frag>;
+
+// Compare and swap and purge.
+let hasSideEffects = 1, Defs = [CC] in {
+ def CSP : CmpSwapRRE<"csp", 0xB250, GR128, GR64>;
+ def CSPG : CmpSwapRRE<"cspg", 0xB98A, GR128, GR64>;
+}
+
+// Load page-table-entry address.
+let hasSideEffects = 1, Defs = [CC] in
+ def LPTEA : TernaryRRFb<"lptea", 0xB9AA, GR64, GR64, GR64>;
+
+// Load real address.
+let hasSideEffects = 1, Defs = [CC] in {
+ defm LRA : LoadAddressRXPair<"lra", 0xB1, 0xE313, null_frag>;
+ def LRAG : LoadAddressRXY<"lrag", 0xE303, null_frag, laaddr20pair>;
+}
+
+// Store real address.
+def STRAG : StoreSSE<"strag", 0xE502>;
+
+// Load using real address.
+let mayLoad = 1 in {
+ def LURA : UnaryRRE<"lura", 0xB24B, null_frag, GR32, GR64>;
+ def LURAG : UnaryRRE<"lurag", 0xB905, null_frag, GR64, GR64>;
+}
+
+// Store using real address.
+let mayStore = 1 in {
+ def STURA : SideEffectBinaryRRE<"stura", 0xB246, GR32, GR64>;
+ def STURG : SideEffectBinaryRRE<"sturg", 0xB925, GR64, GR64>;
+}
+
+// Test protection.
+let hasSideEffects = 1, Defs = [CC] in
+ def TPROT : SideEffectBinarySSE<"tprot", 0xE501>;
+
+//===----------------------------------------------------------------------===//
+// Memory-Move Instructions.
+//===----------------------------------------------------------------------===//
+
+// Move with key.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
+ def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>;
+
+// Move to primary / secondary.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def MVCP : MemoryBinarySSd<"mvcp", 0xDA, GR64>;
+ def MVCS : MemoryBinarySSd<"mvcs", 0xDB, GR64>;
+}
+
+// Move with source / destination key.
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1L] in {
+ def MVCSK : SideEffectBinarySSE<"mvcsk", 0xE50E>;
+ def MVCDK : SideEffectBinarySSE<"mvcdk", 0xE50F>;
+}
+
+// Move with optional specifications.
+let mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def MVCOS : SideEffectTernarySSF<"mvcos", 0xC80, GR64>;
+
+// Move page.
+let mayLoad = 1, mayStore = 1, Uses = [R0L], Defs = [CC] in
+ def MVPG : SideEffectBinaryRRE<"mvpg", 0xB254, GR64, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Address-Space Instructions.
+//===----------------------------------------------------------------------===//
+
+// Load address space parameters.
+let hasSideEffects = 1, Defs = [CC] in
+ def LASP : SideEffectBinarySSE<"lasp", 0xE500>;
+
+// Purge ALB.
+let hasSideEffects = 1 in
+ def PALB : SideEffectInherentRRE<"palb", 0xB248>;
+
+// Program call.
+let hasSideEffects = 1 in
+ def PC : SideEffectAddressS<"pc", 0xB218, null_frag>;
+
+// Program return.
+let hasSideEffects = 1, Defs = [CC] in
+ def PR : SideEffectInherentE<"pr", 0x0101>;
+
+// Program transfer (with instance).
+let hasSideEffects = 1 in {
+ def PT : SideEffectBinaryRRE<"pt", 0xB228, GR32, GR64>;
+ def PTI : SideEffectBinaryRRE<"pti", 0xB99E, GR64, GR64>;
+}
+
+// Resume program.
+let hasSideEffects = 1, Defs = [CC] in
+ def RP : SideEffectAddressS<"rp", 0xB277, null_frag>;
+
+// Branch in subspace group.
+let hasSideEffects = 1 in
+ def BSG : UnaryRRE<"bsg", 0xB258, null_frag, GR64, GR64>;
+
+// Branch and set authority.
+let hasSideEffects = 1 in
+ def BSA : UnaryRRE<"bsa", 0xB25A, null_frag, GR64, GR64>;
+
+// Test access.
+let Defs = [CC] in
+ def TAR : SideEffectBinaryRRE<"tar", 0xB24C, AR32, GR32>;
+
+//===----------------------------------------------------------------------===//
+// Linkage-Stack Instructions.
+//===----------------------------------------------------------------------===//
+
+// Branch and stack.
+let hasSideEffects = 1 in
+ def BAKR : SideEffectBinaryRRE<"bakr", 0xB240, GR64, GR64>;
+
+// Extract stacked registers.
+let hasSideEffects = 1 in {
+ def EREG : SideEffectBinaryRRE<"ereg", 0xB249, GR32, GR32>;
+ def EREGG : SideEffectBinaryRRE<"eregg", 0xB90E, GR64, GR64>;
+}
+
+// Extract stacked state.
+let hasSideEffects = 1, Defs = [CC] in
+ def ESTA : UnaryRRE<"esta", 0xB24A, null_frag, GR128, GR32>;
+
+// Modify stacked state.
+let hasSideEffects = 1 in
+ def MSTA : SideEffectUnaryRRE<"msta", 0xB247, GR128, null_frag>;
+
+//===----------------------------------------------------------------------===//
+// Time-Related Instructions.
+//===----------------------------------------------------------------------===//
+
+// Perform timing facility function.
+let hasSideEffects = 1, mayLoad = 1, Uses = [R0L, R1D], Defs = [CC] in
+ def PTFF : SideEffectInherentE<"ptff", 0x0104>;
+
+// Set clock.
+let hasSideEffects = 1, Defs = [CC] in
+ def SCK : SideEffectUnaryS<"sck", 0xB204, null_frag, 8>;
+
+// Set clock programmable field.
+let hasSideEffects = 1, Uses = [R0L] in
+ def SCKPF : SideEffectInherentE<"sckpf", 0x0107>;
+
+// Set clock comparator.
+let hasSideEffects = 1 in
+ def SCKC : SideEffectUnaryS<"sckc", 0xB206, null_frag, 8>;
+
+// Set CPU timer.
+let hasSideEffects = 1 in
+ def SPT : SideEffectUnaryS<"spt", 0xB208, null_frag, 8>;
+
+// Store clock (fast / extended).
+let hasSideEffects = 1, Defs = [CC] in {
+ def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
+ def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>;
+ def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>;
+}
+
+// Store clock comparator.
+let hasSideEffects = 1 in
+ def STCKC : StoreInherentS<"stckc", 0xB207, null_frag, 8>;
+
+// Store CPU timer.
+let hasSideEffects = 1 in
+ def STPT : StoreInherentS<"stpt", 0xB209, null_frag, 8>;
+
+//===----------------------------------------------------------------------===//
+// CPU-Related Instructions.
+//===----------------------------------------------------------------------===//
+
+// Store CPU address.
+let hasSideEffects = 1 in
+ def STAP : StoreInherentS<"stap", 0xB212, null_frag, 2>;
+
+// Store CPU ID.
+let hasSideEffects = 1 in
+ def STIDP : StoreInherentS<"stidp", 0xB202, null_frag, 8>;
+
+// Store system information.
+let hasSideEffects = 1, Uses = [R0L, R1L], Defs = [R0L, CC] in
+ def STSI : StoreInherentS<"stsi", 0xB27D, null_frag, 0>;
+
+// Store facility list.
+let hasSideEffects = 1 in
+ def STFL : StoreInherentS<"stfl", 0xB2B1, null_frag, 4>;
+
+// Store facility list extended.
+let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
+ def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+
+// Extract CPU attribute.
+let hasSideEffects = 1 in
+ def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>;
+
+// Extract CPU time.
+let hasSideEffects = 1, mayLoad = 1, Defs = [R0D, R1D] in
+ def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+
+// Perform topology function.
+let hasSideEffects = 1 in
+ def PTF : UnaryTiedRRE<"ptf", 0xB9A2, GR64>;
+
+// Perform cryptographic key management operation.
+let Predicates = [FeatureMessageSecurityAssist3],
+ hasSideEffects = 1, Uses = [R0L, R1D] in
+ def PCKMO : SideEffectInherentRRE<"pckmo", 0xB928>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Supervisor call.
+let hasSideEffects = 1, isCall = 1, Defs = [CC] in
+ def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+
+// Monitor call.
+let hasSideEffects = 1, isCall = 1 in
+ def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>;
+
+// Diagnose.
+let hasSideEffects = 1, isCall = 1 in
+ def DIAG : SideEffectTernaryRS<"diag", 0x83, GR32, GR32>;
+
+// Trace.
+let hasSideEffects = 1, mayLoad = 1 in {
+ def TRACE : SideEffectTernaryRS<"trace", 0x99, GR32, GR32>;
+ def TRACG : SideEffectTernaryRSY<"tracg", 0xEB0F, GR64, GR64>;
+}
+
+// Trap.
+let hasSideEffects = 1 in {
+ def TRAP2 : SideEffectInherentE<"trap2", 0x01FF>;
+ def TRAP4 : SideEffectAddressS<"trap4", 0xB2FF, null_frag>;
+}
+
+// Signal processor.
+let hasSideEffects = 1, Defs = [CC] in
+ def SIGP : SideEffectTernaryRS<"sigp", 0xAE, GR64, GR64>;
+
+// Signal adapter.
+let hasSideEffects = 1, Uses = [R0D, R1D, R2D, R3D], Defs = [CC] in
+ def SIGA : SideEffectAddressS<"siga", 0xB274, null_frag>;
+
+// Start interpretive execution.
+let hasSideEffects = 1, Defs = [CC] in
+ def SIE : SideEffectUnaryS<"sie", 0xB214, null_frag, 0>;
+
+//===----------------------------------------------------------------------===//
+// CPU-Measurement Facility Instructions (SA23-2260).
+//===----------------------------------------------------------------------===//
+
+// Load program parameter.
+let hasSideEffects = 1 in
+ def LPP : SideEffectUnaryS<"lpp", 0xB280, null_frag, 8>;
+
+// Extract coprocessor-group address.
+let hasSideEffects = 1, Defs = [CC] in
+ def ECPGA : UnaryRRE<"ecpga", 0xB2ED, null_frag, GR32, GR64>;
+
+// Extract CPU counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def ECCTR : UnaryRRE<"ecctr", 0xB2E4, null_frag, GR64, GR64>;
+
+// Extract peripheral counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def EPCTR : UnaryRRE<"epctr", 0xB2E5, null_frag, GR64, GR64>;
+
+// Load CPU-counter-set controls.
+let hasSideEffects = 1, Defs = [CC] in
+ def LCCTL : SideEffectUnaryS<"lcctl", 0xB284, null_frag, 8>;
+
+// Load peripheral-counter-set controls.
+let hasSideEffects = 1, Defs = [CC] in
+ def LPCTL : SideEffectUnaryS<"lpctl", 0xB285, null_frag, 8>;
+
+// Load sampling controls.
+let hasSideEffects = 1, Defs = [CC] in
+ def LSCTL : SideEffectUnaryS<"lsctl", 0xB287, null_frag, 0>;
+
+// Query sampling information.
+let hasSideEffects = 1 in
+ def QSI : StoreInherentS<"qsi", 0xB286, null_frag, 0>;
+
+// Query counter information.
+let hasSideEffects = 1 in
+ def QCTRI : StoreInherentS<"qctri", 0xB28E, null_frag, 0>;
+
+// Set CPU counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def SCCTR : SideEffectBinaryRRE<"scctr", 0xB2E0, GR64, GR64>;
+
+// Set peripheral counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def SPCTR : SideEffectBinaryRRE<"spctr", 0xB2E1, GR64, GR64>;
+
+//===----------------------------------------------------------------------===//
+// I/O Instructions (Principles of Operation, Chapter 14).
+//===----------------------------------------------------------------------===//
+
+// Clear subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def CSCH : SideEffectInherentS<"csch", 0xB230, null_frag>;
+
+// Halt subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def HSCH : SideEffectInherentS<"hsch", 0xB231, null_frag>;
+
+// Modify subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def MSCH : SideEffectUnaryS<"msch", 0xB232, null_frag, 0>;
+
+// Resume subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def RSCH : SideEffectInherentS<"rsch", 0xB238, null_frag>;
+
+// Start subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def SSCH : SideEffectUnaryS<"ssch", 0xB233, null_frag, 0>;
+
+// Store subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def STSCH : StoreInherentS<"stsch", 0xB234, null_frag, 0>;
+
+// Test subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def TSCH : StoreInherentS<"tsch", 0xB235, null_frag, 0>;
+
+// Cancel subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def XSCH : SideEffectInherentS<"xsch", 0xB276, null_frag>;
+
+// Reset channel path.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def RCHP : SideEffectInherentS<"rchp", 0xB23B, null_frag>;
+
+// Set channel monitor.
+let hasSideEffects = 1, mayLoad = 1, Uses = [R1L, R2D] in
+ def SCHM : SideEffectInherentS<"schm", 0xB23C, null_frag>;
+
+// Store channel path status.
+let hasSideEffects = 1 in
+ def STCPS : StoreInherentS<"stcps", 0xB23A, null_frag, 0>;
+
+// Store channel report word.
+let hasSideEffects = 1, Defs = [CC] in
+ def STCRW : StoreInherentS<"stcrw", 0xB239, null_frag, 0>;
+
+// Test pending interruption.
+let hasSideEffects = 1, Defs = [CC] in
+ def TPI : StoreInherentS<"tpi", 0xB236, null_frag, 0>;
+
+// Set address limit.
+let hasSideEffects = 1, Uses = [R1L] in
+ def SAL : SideEffectInherentS<"sal", 0xB237, null_frag>;
+
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 47d2f75cc11a..36809ea81dc1 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -304,3 +304,13 @@ foreach I = 0-15 in {
defm AR32 : SystemZRegClass<"AR32", [i32], 32,
(add (sequence "A%u", 0, 15)), 0>;
+// Control registers.
+class CREG64<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+foreach I = 0-15 in {
+ def C#I : CREG64<I, "c"#I>, DwarfRegNum<[!add(I, 32)]>;
+}
+defm CR64 : SystemZRegClass<"CR64", [i64], 64,
+ (add (sequence "C%u", 0, 15)), 0>;
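+// The control-register class is used by the LCTL/LCTLG and STCTL/STCTG
+// definitions in SystemZInstrSystem.td.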
+
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 5f5f2f690e58..adc9f2976f87 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -353,6 +353,9 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
+def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
@@ -376,6 +379,8 @@ def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
def : InstRW<[FXa], (instregex "SLR(K)?$")>;
def : InstRW<[FXa], (instregex "SR(K)?$")>;
+def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
@@ -506,6 +511,8 @@ def : InstRW<[FXb], (instregex "CLIH$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXb], (instregex "CLR$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
// Compare halfword
def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
@@ -701,38 +708,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
-// Move with key
-def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
-
-// Monitor call
-def : InstRW<[FXb], (instregex "MC$")>;
-
-// Extract CPU attribute
-def : InstRW<[FXb, Lat30], (instregex "ECAG$")>;
-
-// Extract CPU Time
-def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
-
-// Extract PSW
-def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
-
// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
-// Program return
-def : InstRW<[FXb, Lat30], (instregex "PR$")>;
-
-// Inline assembly
-def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
- (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "STFLE$")>;
-def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
-
-// Store real address
-def : InstRW<[FXb, LSU, Lat5], (instregex "STRAG$")>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
@@ -1364,5 +1342,162 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXa, Lat3], (instregex "IPK$")>;
+def : InstRW<[LSU], (instregex "SPKA$")>;
+def : InstRW<[LSU], (instregex "SSM$")>;
+def : InstRW<[FXb], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXb, Lat30], (instregex "TB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-Move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU], (instregex "PALB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXb, Lat30], (instregex "PR$")>;
+def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "BAKR$")>;
+def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[LSU, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
+ (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[LSU, LSU, FXb, Lat3], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXb], (instregex "MC$")>;
+def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "LPP$")>;
+def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "L(C|P|S)CTL$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+
}
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index 126eac2e2072..128049a09086 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -310,6 +310,9 @@ def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
+def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
@@ -333,6 +336,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
def : InstRW<[FXU], (instregex "SLR(K)?$")>;
def : InstRW<[FXU], (instregex "SR(K)?$")>;
+def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
@@ -468,6 +473,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXU], (instregex "CLR$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>;
// Compare halfword
def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>;
@@ -634,37 +641,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
-// Move with key
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
-
-// Monitor call
-def : InstRW<[FXU], (instregex "MC$")>;
-
-// Extract CPU attribute
-def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
-
-// Extract CPU Time
-def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
-
-// Extract PSW
-def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
-
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
-// Program return
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-
-// Inline assembly
-def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
-def : InstRW<[LSU, FXU, Lat5], (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-
-// Store real address
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
@@ -1058,5 +1037,160 @@ def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>;
def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXU, Lat3], (instregex "IPK$")>;
+def : InstRW<[LSU], (instregex "SPKA$")>;
+def : InstRW<[LSU], (instregex "SSM$")>;
+def : InstRW<[FXU], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXU, Lat30], (instregex "TB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-Move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[LSU, Lat30], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU], (instregex "PALB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>;
+def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
+def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXU], (instregex "MC$")>;
+def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXU], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LPP$")>;
+def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+
}
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index d38ca64d2e9b..76b378454631 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -320,6 +320,9 @@ def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
+def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXU, Lat2], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
@@ -343,6 +346,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
def : InstRW<[FXU], (instregex "SLR(K)?$")>;
def : InstRW<[FXU], (instregex "SR(K)?$")>;
+def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXU, Lat2], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
@@ -478,6 +483,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXU], (instregex "CLR$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXU, Lat2], (instregex "C(L)?HLR$")>;
// Compare halfword
def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
@@ -672,37 +679,9 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>;
-// Move with key
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
-
-// Monitor call
-def : InstRW<[FXU], (instregex "MC$")>;
-
-// Extract CPU attribute
-def : InstRW<[FXU, Lat30], (instregex "ECAG$")>;
-
-// Extract CPU Time
-def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
-
-// Extract PSW
-def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
-
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
-// Program return
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-
-// Inline assembly
-def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-
-// Store real address
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
@@ -1102,5 +1081,161 @@ def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>;
def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXU, Lat3], (instregex "IPK$")>;
+def : InstRW<[LSU], (instregex "SPKA$")>;
+def : InstRW<[LSU], (instregex "SSM$")>;
+def : InstRW<[FXU], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXU, Lat30], (instregex "TB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-Move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[LSU, Lat30], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU], (instregex "PALB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>;
+def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
+def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXU], (instregex "MC$")>;
+def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXU], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LPP$")>;
+def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+
}
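
The InstRW lines above each bind every opcode whose name matches the anchored
regex to the issue resources it occupies (FXU, LSU) and a latency class, with
GroupAlone forcing the instruction into a decoder group of its own. As a rough
conceptual model only - hypothetical table entries, not the real TableGen
machinery - the lookup amounts to:

#include <cassert>
#include <regex>
#include <string>
#include <vector>

// Each entry pairs an opcode-name regex with the pipeline resources the
// matching instructions occupy and a latency bucket. (The real instregex
// also anchors at the start of the opcode name; this sketch does not.)
struct SchedEntry {
  std::regex Pattern;             // e.g. "LCTL(G)?$"
  std::vector<std::string> Units; // e.g. {"FXU", "LSU"}
  unsigned Latency;               // cycles until the result is available
};

const SchedEntry *lookupSched(const std::vector<SchedEntry> &Table,
                              const std::string &Opcode) {
  for (const SchedEntry &E : Table)
    if (std::regex_search(Opcode, E.Pattern))
      return &E;
  return nullptr;
}

int main() {
  std::vector<SchedEntry> Table = {
      {std::regex("LCTL(G)?$"), {"FXU", "LSU"}, 30},
      {std::regex("STCT(L|G)$"), {"LSU"}, 30},
  };
  assert(lookupSched(Table, "LCTLG")->Latency == 30);
  assert(lookupSched(Table, "STCTL")->Units.size() == 1);
  return 0;
}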
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 0ab0c2f25915..eb4a0962f7eb 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const TargetMachine &TM)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasPopulationCount(false), HasMessageSecurityAssist4(false),
+ HasPopulationCount(false), HasMessageSecurityAssist3(false),
+ HasMessageSecurityAssist4(false), HasResetReferenceBitsMultiple(false),
HasFastSerialization(false), HasInterlockedAccess1(false),
HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
- HasDFPZonedConversion(false),
+ HasDFPZonedConversion(false), HasEnhancedDAT2(false),
HasVector(false), HasLoadStoreOnCond2(false),
HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
HasDFPPackedConversion(false),
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index be480f03c572..b05a1bb6cafd 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -39,7 +39,9 @@ protected:
bool HasHighWord;
bool HasFPExtension;
bool HasPopulationCount;
+ bool HasMessageSecurityAssist3;
bool HasMessageSecurityAssist4;
+ bool HasResetReferenceBitsMultiple;
bool HasFastSerialization;
bool HasInterlockedAccess1;
bool HasMiscellaneousExtensions;
@@ -48,6 +50,7 @@ protected:
bool HasTransactionalExecution;
bool HasProcessorAssist;
bool HasDFPZonedConversion;
+ bool HasEnhancedDAT2;
bool HasVector;
bool HasLoadStoreOnCond2;
bool HasLoadAndZeroRightmostByte;
@@ -109,9 +112,18 @@ public:
bool hasPopulationCount() const { return HasPopulationCount; }
// Return true if the target has the message-security-assist
+ // extension facility 3.
+ bool hasMessageSecurityAssist3() const { return HasMessageSecurityAssist3; }
+
+ // Return true if the target has the message-security-assist
// extension facility 4.
bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; }
+ // Return true if the target has the reset-reference-bits-multiple facility.
+ bool hasResetReferenceBitsMultiple() const {
+ return HasResetReferenceBitsMultiple;
+ }
+
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }
@@ -138,6 +150,9 @@ public:
// Return true if the target has the DFP zoned-conversion facility.
bool hasDFPZonedConversion() const { return HasDFPZonedConversion; }
+ // Return true if the target has the enhanced-DAT facility 2.
+ bool hasEnhancedDAT2() const { return HasEnhancedDAT2; }
+
// Return true if the target has the load-and-zero-rightmost-byte facility.
bool hasLoadAndZeroRightmostByte() const {
return HasLoadAndZeroRightmostByte;
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 422c16b8eb62..ce5c57e0f519 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -238,7 +238,7 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
+void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Find out if L contains a call, what the machine instruction count
// estimate is, and how many stores there are.
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index bdba7601eb78..6923fc6fc910 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -45,7 +45,8 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
/// @}
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 39cb1ca336f2..129794171464 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -57,17 +57,19 @@ def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
}
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-// Placemarkers to indicate the start or end of a block or loop scope. These
-// use/clobber VALUE_STACK to prevent them from being moved into the middle of
-// an expression tree.
+// Placemarkers to indicate the start or end of a block, loop, or try scope.
+// These use/clobber VALUE_STACK to prevent them from being moved into the
+// middle of an expression tree.
let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
+def TRY : I<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
-// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode
-// in wasm.
+// END_BLOCK, END_LOOP, END_TRY, and END_FUNCTION are represented with the same
+// opcode in wasm.
def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>;
+def END_TRY : I<(outs), (ins), [], "end_try", 0x0b>;
let isTerminator = 1, isBarrier = 1 in
def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
@@ -112,6 +114,20 @@ let isReturn = 1 in {
def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>;
+def THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$obj),
+ [(int_wasm_throw imm:$tag, I32:$obj)], "throw \t$tag, $obj",
+ 0x08>;
+def THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$obj),
+ [(int_wasm_throw imm:$tag, I64:$obj)], "throw \t$tag, $obj",
+ 0x08>;
+def RETHROW : I<(outs), (ins i32imm:$rel_depth), [], "rethrow \t$rel_depth",
+ 0x09>;
+
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
} // Defs = [ARGUMENTS]
+
+// rethrow takes a relative depth as an argument; for C++, only a depth of 0
+// is currently possible. Once other languages need depths other than 0, the
+// depths will be computed in CFGStackify.
+def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
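
To make the relative-depth convention concrete: the depth counts enclosing try
scopes outward from the innermost one. A minimal sketch under that reading
(hypothetical names, not compiler code):

#include <cassert>
#include <string>
#include <vector>

// A rethrow's relative depth selects an enclosing try scope, counting
// outward from the innermost enclosing scope (depth 0).
std::string rethrowTarget(const std::vector<std::string> &ScopeStack,
                          unsigned RelDepth) {
  assert(RelDepth < ScopeStack.size() && "depth escapes the scope stack");
  return ScopeStack[ScopeStack.size() - 1 - RelDepth]; // innermost at back
}

int main() {
  std::vector<std::string> Scopes = {"outer_try", "inner_try"};
  // C++ currently only rethrows to the innermost scope, i.e. depth 0.
  assert(rethrowTarget(Scopes, 0) == "inner_try");
  return 0;
}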
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 947c0329bb6e..f0b6a3e35dba 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -897,7 +897,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
}
}
- // Look for orphan landingpads, can occur in blocks with no predecesors
+  // Look for orphan landingpads, which can occur in blocks with no predecessors
for (BasicBlock &BB : F) {
Instruction *I = BB.getFirstNonPHI();
if (auto *LPI = dyn_cast<LandingPadInst>(I))
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d30cc724c203..825f23dc52d9 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -49,8 +49,11 @@ static const char OpPrecedence[] = {
4, // IC_MINUS
5, // IC_MULTIPLY
5, // IC_DIVIDE
- 6, // IC_RPAREN
- 7, // IC_LPAREN
+ 5, // IC_MOD
+ 6, // IC_NOT
+ 7, // IC_NEG
+ 8, // IC_RPAREN
+ 9, // IC_LPAREN
0, // IC_IMM
0 // IC_REGISTER
};
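
These levels are only compared against one another: an incoming operator
forces any stacked operator of greater or equal precedence to be applied
first, so the new MOD binds exactly like multiplication and division while
NOT and NEG bind tighter still. A self-contained sketch of just that
comparison, with a reduced operator set whose levels mirror the table:

#include <cassert>

// Reduced operator set; levels mirror the table above (MINUS == 4,
// MOD == 5, NOT == 6).
enum Op { MINUS = 0, MOD, NOT, NUM_OPS };
const int Precedence[NUM_OPS] = {4, 5, 6};

// True if the operator on top of the stack binds at least as tightly as
// the incoming one and must be applied first.
bool popBeforePush(Op Top, Op Incoming) {
  return Precedence[Top] >= Precedence[Incoming];
}

int main() {
  assert(popBeforePush(MOD, MINUS));  // "10 MOD 3 - 1" applies MOD first
  assert(!popBeforePush(MINUS, MOD)); // "1 - 10 MOD 3" stacks MOD on top
  assert(popBeforePush(NOT, MOD));    // "NOT x MOD y" negates x first
  return 0;
}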
@@ -92,6 +95,9 @@ private:
IC_MINUS,
IC_MULTIPLY,
IC_DIVIDE,
+ IC_MOD,
+ IC_NOT,
+ IC_NEG,
IC_RPAREN,
IC_LPAREN,
IC_IMM,
@@ -111,6 +117,10 @@ private:
SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
SmallVector<ICToken, 4> PostfixStack;
+ bool isUnaryOperator(const InfixCalculatorTok Op) {
+ return Op == IC_NEG || Op == IC_NOT;
+ }
+
public:
int64_t popOperand() {
assert (!PostfixStack.empty() && "Popped an empty stack!");
@@ -192,6 +202,22 @@ private:
ICToken Op = PostfixStack[i];
if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
OperandStack.push_back(Op);
+ } else if (isUnaryOperator(Op.first)) {
+ assert (OperandStack.size() > 0 && "Too few operands.");
+ ICToken Operand = OperandStack.pop_back_val();
+ assert (Operand.first == IC_IMM &&
+ "Unary operation with a register!");
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_NEG:
+ OperandStack.push_back(std::make_pair(IC_IMM, -Operand.second));
+ break;
+ case IC_NOT:
+ OperandStack.push_back(std::make_pair(IC_IMM, ~Operand.second));
+ break;
+ }
} else {
assert (OperandStack.size() > 1 && "Too few operands.");
int64_t Val;
@@ -222,6 +248,12 @@ private:
Val = Op1.second / Op2.second;
OperandStack.push_back(std::make_pair(IC_IMM, Val));
break;
+ case IC_MOD:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Modulo operation with an immediate and a register!");
+ Val = Op1.second % Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
case IC_OR:
assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
"Or operation with an immediate and a register!");
@@ -271,6 +303,7 @@ private:
IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
+ IES_MOD,
IES_LBRAC,
IES_RBRAC,
IES_LPAREN,
@@ -421,10 +454,16 @@ private:
default:
State = IES_ERROR;
break;
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
case IES_PLUS:
case IES_NOT:
case IES_MULTIPLY:
case IES_DIVIDE:
+ case IES_MOD:
case IES_LPAREN:
case IES_RPAREN:
case IES_LBRAC:
@@ -432,11 +471,12 @@ private:
case IES_INTEGER:
case IES_REGISTER:
State = IES_MINUS;
- // Only push the minus operator if it is not a unary operator.
- if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
- CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
- CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+      // Push the minus operator only when it is a binary subtraction,
+      // not a unary negation.
+ if (CurrState == IES_REGISTER || CurrState == IES_RPAREN ||
+ CurrState == IES_INTEGER || CurrState == IES_RBRAC)
IC.pushOperator(IC_MINUS);
+ else
+ IC.pushOperator(IC_NEG);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
// a scale of 1.
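
The CurrState test above encodes the classic rule for telling subtraction
from negation: a minus is binary only when the token just before it can end
an operand. The same rule as a standalone sketch, with an illustrative
token-kind enum:

#include <cassert>

// A minus is a binary subtraction only when the previous token can
// terminate an operand (integer, register, closing paren or bracket);
// anywhere else it is a unary negation.
enum TokKind { Integer, Register, RParen, RBrac, Plus, LParen, Start };

bool minusIsBinary(TokKind Prev) {
  return Prev == Integer || Prev == Register || Prev == RParen ||
         Prev == RBrac;
}

int main() {
  assert(minusIsBinary(Integer)); // "4 - 2"  -> subtraction
  assert(!minusIsBinary(Plus));   // "4 + -2" -> negation
  assert(!minusIsBinary(LParen)); // "(-2)"   -> negation
  assert(!minusIsBinary(Start));  // "-2"     -> negation
  return 0;
}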
@@ -458,9 +498,21 @@ private:
default:
State = IES_ERROR;
break;
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
case IES_PLUS:
+ case IES_MINUS:
case IES_NOT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_MOD:
+ case IES_LPAREN:
+ case IES_LBRAC:
State = IES_NOT;
+ IC.pushOperator(IC_NOT);
break;
}
PrevState = CurrState;
@@ -525,6 +577,7 @@ private:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_DIVIDE:
+ case IES_MOD:
case IES_MULTIPLY:
case IES_LPAREN:
State = IES_INTEGER;
@@ -539,26 +592,6 @@ private:
}
// Get the scale and replace the 'Register * Scale' with '0'.
IC.popOperator();
- } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
- PrevState == IES_OR || PrevState == IES_AND ||
- PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
- PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
- PrevState == IES_NOT || PrevState == IES_XOR) &&
- CurrState == IES_MINUS) {
- // Unary minus. No need to pop the minus operand because it was never
- // pushed.
- IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
- } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
- PrevState == IES_OR || PrevState == IES_AND ||
- PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
- PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
- PrevState == IES_NOT || PrevState == IES_XOR) &&
- CurrState == IES_NOT) {
- // Unary not. No need to pop the not operand because it was never
- // pushed.
- IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
} else {
IC.pushOperand(IC_IMM, TmpInt);
}
@@ -594,6 +627,19 @@ private:
break;
}
}
+ void onMod() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_MOD;
+ IC.pushOperator(IC_MOD);
+ break;
+ }
+ }
void onLBrac() {
PrevState = State;
switch (State) {
@@ -647,18 +693,8 @@ private:
case IES_RSHIFT:
case IES_MULTIPLY:
case IES_DIVIDE:
+ case IES_MOD:
case IES_LPAREN:
- // FIXME: We don't handle this type of unary minus or not, yet.
- if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
- PrevState == IES_OR || PrevState == IES_AND ||
- PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
- PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
- PrevState == IES_NOT || PrevState == IES_XOR) &&
- (CurrState == IES_MINUS || CurrState == IES_NOT)) {
- State = IES_ERROR;
- break;
- }
State = IES_LPAREN;
IC.pushOperator(IC_LPAREN);
break;
@@ -1302,6 +1338,8 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine
SM.onXor();
else if (Name.equals_lower("and"))
SM.onAnd();
+ else if (Name.equals_lower("mod"))
+ SM.onMod();
else
return false;
return true;
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index caf98bffb80d..8f2017e990c5 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -396,7 +396,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
if (!SB->getFragment()) {
Asm.getContext().reportError(
Fixup.getLoc(),
- "symbol '" + B->getSymbol().getName() +
+ "symbol '" + SB->getName() +
"' can not be undefined in a subtraction expression");
return false;
}
@@ -408,7 +408,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
// pedantic compatibility with 'as'.
Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
: (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
- Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ Value2 = Writer->getSymbolAddress(*SB, Layout);
FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
}
@@ -468,8 +468,8 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
const MCFixup &Fixup,
MCValue Target,
uint64_t &FixedValue) {
- assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
- !is64Bit() &&
+ const MCSymbolRefExpr *SymA = Target.getSymA();
+ assert(SymA->getKind() == MCSymbolRefExpr::VK_TLVP && !is64Bit() &&
"Should only be called with a 32-bit TLVP relocation!");
unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
@@ -480,15 +480,14 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
// subtraction from the picbase. For 32-bit pic the addend is the difference
// between the picbase and the next address. For 32-bit static the addend is
// zero.
- if (Target.getSymB()) {
+ if (auto *SymB = Target.getSymB()) {
// If this is a subtraction then we're pcrel.
uint32_t FixupAddress =
Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
IsPCRel = 1;
- FixedValue =
- FixupAddress -
- Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) +
- Target.getConstant();
+ FixedValue = FixupAddress -
+ Writer->getSymbolAddress(SymB->getSymbol(), Layout) +
+ Target.getConstant();
FixedValue += 1ULL << Log2Size;
} else {
FixedValue = 0;
@@ -499,8 +498,7 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
MRE.r_word0 = Value;
MRE.r_word1 =
(IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
- Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(),
- MRE);
+ Writer->addRelocation(&SymA->getSymbol(), Fragment->getParent(), MRE);
}
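
With made-up addresses, the subtraction branch above reduces to a few
additions; the bias of one fixup width makes the value relative to the next
address, matching the picbase comment. A hedged sketch of the arithmetic:

#include <cassert>
#include <cstdint>

// For a pcrel TLVP fixup, the stored value is the distance from the
// subtracted symbol (the picbase) to the fixup plus any constant addend,
// biased by the fixup width so it is taken from the next address.
uint64_t tlvpFixedValue(uint64_t FixupAddress, uint64_t PicbaseAddress,
                        int64_t Constant, unsigned Log2Size) {
  return FixupAddress - PicbaseAddress + Constant + (1ULL << Log2Size);
}

int main() {
  // A 4-byte fixup (Log2Size == 2) at 0x1010 against a picbase at 0x1000.
  assert(tlvpFixedValue(0x1010, 0x1000, 0, 2) == 0x14);
  return 0;
}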
void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 5892f1de33ee..807f7a6ddb19 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -44,7 +44,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
const MCAsmBackend &MAB) const {
unsigned FixupKind = Fixup.getKind();
if (IsCrossSection) {
- if (FixupKind != FK_Data_4) {
+ if (FixupKind != FK_Data_4 && FixupKind != llvm::X86::reloc_signed_4byte) {
Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
return COFF::IMAGE_REL_AMD64_ADDR32;
}
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index fe105298f5c1..7437ebacfac3 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -300,6 +300,8 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
"Intel Atom processors">;
def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
"Intel Silvermont processors">;
+def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
+ "Intel Goldmont processors">;
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
@@ -430,6 +432,34 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
def : SilvermontProc<"silvermont">;
def : SilvermontProc<"slm">; // Legacy alias.
+class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
+ ProcIntelGLM,
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeatureCallRegIndirect,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF,
+ FeatureMPX,
+ FeatureSHA,
+ FeatureRDSEED,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT
+]>;
+def : GoldmontProc<"goldmont">;
+
// "Arrandale" along with corei3 and corei5
class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureX87,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f777e5628988..b89914f8893e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5065,6 +5065,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
+// Return true if the instruction zeroes the unused upper part of the
+// destination register and accepts a mask.
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return true;
+ }
+}
+
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -5097,6 +5111,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// 3. Subvector should be inserted in the middle (for example v2i1
// to v16i1, index 2)
+  // If this node widens - by concatenating zeroes - the type of the result
+  // of a node whose instruction zeroes all upper (irrelevant) bits of the
+  // output register, mark this node as legal so that instruction selection
+  // can replace it with the v8i1 version of the previous instruction.
+  // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
+  // k-register while zeroing the remaining upper 60 bits. If the result of
+  // such an instruction is inserted into an all-zeros vector, the
+  // insert_subvector can safely be removed (in instruction selection)
+  // because the compare instruction has already zeroed the rest of the
+  // register.
+ if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
+ (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
+ (SubVec.getOpcode() == ISD::AND &&
+ (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
+ isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
+ return Op;
+
// extend to natively supported kshift
MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
MVT WideOpVT = OpVT;
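
The early return above relies on a property that plain integers model well:
a compare into a mask register writes its N lanes into the low bits and
zeroes the rest, so inserting that result at index 0 of an all-zeros vector
changes nothing. A sketch with uint64_t standing in for a k-register:

#include <cassert>
#include <cstdint>

// A k-register holds up to 64 i1 lanes; a compare with N lanes deposits N
// result bits and zeroes bits N..63.
uint64_t compareNLanes(uint64_t ResultBits, unsigned N) {
  uint64_t LaneMask = (N >= 64) ? ~0ULL : ((1ULL << N) - 1);
  return ResultBits & LaneMask;
}

// Inserting an N-lane value at index 0 of an all-zeros wide mask.
uint64_t insertIntoZeros(uint64_t Narrow, unsigned N) {
  uint64_t LaneMask = (N >= 64) ? ~0ULL : ((1ULL << N) - 1);
  return 0 | (Narrow & LaneMask);
}

int main() {
  // A v4i1 compare result (lanes 0 and 2 set) in a 64-bit k-register.
  uint64_t KReg = compareNLanes(0x5, 4);
  // The insert into zeros is a no-op: the register already matches.
  assert(insertIntoZeros(KReg, 4) == KReg);
  return 0;
}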
@@ -7919,6 +7949,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
+// Return true if all the operands of the given CONCAT_VECTORS node are zeros
+// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
+static bool isExpandWithZeros(const SDValue &Op) {
+ assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
+ "Expand with zeros only possible in CONCAT_VECTORS nodes!");
+
+ for (unsigned i = 1; i < Op.getNumOperands(); i++)
+ if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
+ return false;
+
+ return true;
+}
+
+// If the given node is a type promotion (by concatenating i1 zeros) of the
+// result of a node that already zeroes all upper bits of a k-register,
+// return that result; otherwise return an empty SDValue.
+static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
+ unsigned Opc = Op.getOpcode();
+
+ assert(Opc == ISD::CONCAT_VECTORS &&
+ Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected node to check for type promotion!");
+
+  // As long as we are concatenating zeros to the upper part of a previous
+  // node's result, climb up the tree until a node with a different opcode
+  // is encountered.
+ while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
+ if (Opc == ISD::INSERT_SUBVECTOR) {
+ if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
+ Op.getConstantOperandVal(2) == 0)
+ Op = Op.getOperand(1);
+ else
+ return SDValue();
+ } else { // Opc == ISD::CONCAT_VECTORS
+ if (isExpandWithZeros(Op))
+ Op = Op.getOperand(0);
+ else
+ return SDValue();
+ }
+ Opc = Op.getOpcode();
+ }
+
+ // Check if the first inserted node zeroes the upper bits, or an 'and' result
+ // of a node that zeros the upper bits (its masked version).
+ if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
+ (Op.getOpcode() == ISD::AND &&
+ (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
+ isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
+ return Op;
+ }
+
+ return SDValue();
+}
+
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
@@ -7929,6 +8013,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(isPowerOf2_32(NumOfOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
+  // If this node promotes - by concatenating zeroes - the type of the result
+  // of a node whose instruction zeroes all upper (irrelevant) bits of the
+  // output register, mark it as legal and catch the pattern in instruction
+  // selection to avoid emitting extra instructions (for zeroing upper bits).
+ if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
+ SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
+ SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
+ ZeroC);
+ }
+
SDValue Undef = DAG.getUNDEF(ResVT);
if (NumOfOperands > 2) {
// Specialize the cases when all, or all but one, of the operands are undef.
@@ -27012,6 +27107,9 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned InputSizeInBits = MaskVT.getSizeInBits();
+ unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
bool ContainsZeros = false;
APInt Zeroable(NumMaskElts, false);
@@ -27027,7 +27125,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskVT.getScalarSizeInBits(), Mask,
+ MaskScalarSizeInBits, Mask,
0, Zeroable, Subtarget);
if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
@@ -27043,10 +27141,6 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
return SM_SentinelUndef <= M && M < (int)NumMaskElts;
}) && "Expected unary shuffle");
- unsigned InputSizeInBits = MaskVT.getSizeInBits();
- unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
- MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
-
// Handle PSHUFLW/PSHUFHW repeated patterns.
if (MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
@@ -35072,7 +35166,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB &&
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
"Unexpected opcode for increment/decrement transform");
// Pseudo-legality check: getOnesVector() expects one of these types, so bail
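
The added parentheses matter because && binds tighter than ||, so the old
condition parsed as ADD || (SUB && "message"). Since a string literal is
always truthy, both groupings happen to accept the same opcodes here; the
change silences the compiler's parentheses warning and restores the intended
reading. A minimal illustration:

#include <cassert>

int main() {
  int Opcode = 1; // pretend 1 == ADD, 2 == SUB
  // Parses as: (Opcode == 1) || ((Opcode == 2) && "msg"). The literal is
  // non-null, hence always true, so the value is coincidentally the same -
  // but compilers warn and the grouping is accidental.
  bool Unparenthesized = Opcode == 1 || Opcode == 2 && "msg";
  bool Parenthesized = (Opcode == 1 || Opcode == 2) && "msg";
  assert(Unparenthesized == Parenthesized);
  return 0;
}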
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 01a70323224c..cc5c09cbf0e5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -185,6 +185,20 @@ def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
v2f64x_info>;
+class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
+ ValueType _vt> {
+ RegisterClass KRC = _krc;
+ RegisterClass KRCWM = _krcwm;
+ ValueType KVT = _vt;
+}
+
+def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
+def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
+def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
+def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
+def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
+def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+
// This multiclass generates the masking variants from the non-masking
// variant. It only provides the assembly pieces for the masking variants.
// It assumes custom ISel patterns for masking which can be provided as
@@ -1735,17 +1749,217 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (COPY_TO_REGCLASS (VPCMPGTDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (COPY_TO_REGCLASS (VPCMPEQDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-}
+multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask,
+ _.RC:$src1, _.RC:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))))))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+}
+}
+
+multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds>
+ : avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+}
+}
+
+// VPCMPEQB - i8
+defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPEQW - i16
+defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQWZ", [HasBWI]>;
+
+// VPCMPEQD - i32
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQDZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQDZ", [HasAVX512]>;
+
+// VPCMPEQQ - i64
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQQZ", [HasAVX512]>;
+
+// VPCMPGTB - i8
+defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPGTW - i16
+defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTWZ", [HasBWI]>;
+
+// VPCMPGTD - i32
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTDZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTDZ", [HasAVX512]>;
+
+// VPCMPGTQ - i64
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTQZ", [HasAVX512]>;
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> {
@@ -1908,6 +2122,237 @@ defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask,
+ _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))),
+ imm:$cc)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask,
+ _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+}
+
+multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds>
+ : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask,
+ _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+}
+
+// VPCMPB - i8
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm,
+ "VPCMPBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm,
+ "VPCMPBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm,
+ "VPCMPBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPW - i16
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm,
+ "VPCMPWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm,
+ "VPCMPWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm,
+ "VPCMPWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm,
+ "VPCMPWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm,
+ "VPCMPWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm,
+ "VPCMPWZ", [HasBWI]>;
+
+// VPCMPD - i32
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm,
+ "VPCMPDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm,
+ "VPCMPDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm,
+ "VPCMPDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm,
+ "VPCMPDZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm,
+ "VPCMPDZ", [HasAVX512]>;
+
+// VPCMPQ - i64
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm,
+ "VPCMPQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm,
+ "VPCMPQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm,
+ "VPCMPQZ", [HasAVX512]>;
+
+// VPCMPUB - i8
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu,
+ "VPCMPUBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu,
+ "VPCMPUBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu,
+ "VPCMPUBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPUW - i16
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu,
+ "VPCMPUWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu,
+ "VPCMPUWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu,
+ "VPCMPUWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu,
+ "VPCMPUWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu,
+ "VPCMPUWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu,
+ "VPCMPUWZ", [HasBWI]>;
+
+// VPCMPUD - i32
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu,
+ "VPCMPUDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu,
+ "VPCMPUDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu,
+ "VPCMPUDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu,
+ "VPCMPUDZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu,
+ "VPCMPUDZ", [HasAVX512]>;
+
+// VPCMPUQ - i64
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu,
+ "VPCMPUQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu,
+ "VPCMPUQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu,
+ "VPCMPUQZ", [HasAVX512]>;
+
multiclass avx512_vcmp_common<X86VectorVTInfo _> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -1998,21 +2443,108 @@ defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (VCMPPSZrri
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
-def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (VPCMPDZrri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
-def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (VPCMPUDZrri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
+multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ string InstrStr, list<Predicate> Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpm (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+}
+
+multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ string InstrStr, list<Predicate> Preds>
+ : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> {
+
+let Predicates = Preds in
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpmRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+
+
+// VCMPPS - f32
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ",
+ [HasAVX512]>;
+defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ",
+ [HasAVX512]>;
+
+// VCMPPD - f64
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ",
+ [HasAVX512]>;
+defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ",
+ [HasAVX512]>;
+defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ",
+ [HasAVX512]>;
// ----------------------------------------------------------------
// FPClass
@@ -2498,6 +3030,69 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>;
defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>;
+multiclass avx512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> {
+def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr)
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr)
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (i8 8)), (i8 8))>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (and VK8:$mask,
+ (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk)
+ (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (i8 8)), (i8 8))>;
+}
+
+multiclass avx512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+ AVX512VLVectorVTInfo _> {
+def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri)
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri)
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc),
+ (i8 8)), (i8 8))>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (and VK8:$mask,
+ (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc),
+ (i8 8)), (i8 8))>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+  defm : avx512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">;
+  defm : avx512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">;
+
+  defm : avx512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>;
+  defm : avx512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>;
+  defm : avx512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>;
+}
+
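
The KSHIFTL/KSHIFTR pairs above are the heart of the no-VLX lowering: the
compare runs at 512-bit width, producing a 16-lane mask whose upper eight
lanes are garbage, and shifting the mask left then right by 8 clears them.
Modeling the mask register as a uint16_t:

#include <cassert>
#include <cstdint>

// KSHIFTLW then KSHIFTRW by 8 on a 16-lane mask: zeroes the upper eight
// lanes while keeping the lower eight intact.
uint16_t clearUpper8(uint16_t Mask) {
  return (uint16_t)((uint16_t)(Mask << 8) >> 8);
}

int main() {
  // Lower half 0xA5 is the real v8i1 result; upper half 0xFF is leftover
  // garbage from widening the compare to 512 bits.
  assert(clearUpper8(0xFFA5) == 0x00A5);
  return 0;
}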
// Mask setting all 0s or 1s
multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
let Predicates = [HasAVX512] in
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index f98c2a7e802d..e34a90e975b8 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -75,6 +75,8 @@ private:
bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -270,6 +272,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectUadde(I, MRI, MF))
return true;
+ if (selectMergeValues(I, MRI, MF))
+ return true;
if (selectExtract(I, MRI, MF))
return true;
if (selectInsert(I, MRI, MF))
@@ -914,6 +918,55 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
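+
+// Lower G_MERGE_VALUES of N narrow registers into one wide register: the
+// first source is placed with a subregister insert, each following source is
+// laid down by a G_INSERT at bit offset (Idx - 1) * SrcSize, and the final
+// partial value is copied into the destination. Each instruction built here
+// is re-selected immediately, so only selected instructions are left behind.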
+bool X86InstructionSelector::selectMergeValues(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_MERGE_VALUES)
+ return false;
+
+ // Split the G_MERGE_VALUES into a chain of G_INSERTs.
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned SrcReg0 = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg0);
+ unsigned SrcSize = SrcTy.getSizeInBits();
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ // For the first source, use emitInsertSubreg to place it in the wide register.
+ unsigned DefReg = MRI.createGenericVirtualRegister(DstTy);
+ MRI.setRegBank(DefReg, RegBank);
+ if (!emitInsertSubreg(DefReg, SrcReg0, I, MRI, MF))
+ return false;
+
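+ // Chain the remaining sources: every G_INSERT reads the previous partial
+ // value and defines a fresh virtual register, which keeps the chain in SSA
+ // form until the final COPY.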
+ for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) {
+ unsigned Tmp = MRI.createGenericVirtualRegister(DstTy);
+ MRI.setRegBank(Tmp, RegBank);
+
+ MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::G_INSERT), Tmp)
+ .addReg(DefReg)
+ .addReg(I.getOperand(Idx).getReg())
+ .addImm((Idx - 1) * SrcSize);
+
+ DefReg = Tmp;
+
+ if (!select(InsertInst))
+ return false;
+ }
+
+ MachineInstr &CopyInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(DefReg);
+
+ if (!select(CopyInst))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index a584eabcc1b2..a5fa3340c3f1 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -56,7 +56,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL})
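+ // The bitwise ops become legal for the same scalar types as the arithmetic
+ // binops, so all six share one loop.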
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
for (auto Ty : {s8, s16, s32})
setAction({BinOp, Ty}, Legal);
@@ -117,7 +117,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL})
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
for (auto Ty : {s8, s16, s32, s64})
setAction({BinOp, Ty}, Legal);
@@ -228,10 +228,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX() {
for (auto Ty : {v8s32, v4s64})
setAction({MemOp, Ty}, Legal);
- for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
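+ // Type index 0 (the default) is the defined value, type index 1 the other
+ // vector operand: the piece being inserted for G_INSERT, the container being
+ // read for G_EXTRACT. 256-bit vectors are the legal containers here and
+ // 128-bit vectors the legal pieces.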
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) {
setAction({G_INSERT, Ty}, Legal);
- for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) {
setAction({G_INSERT, 1, Ty}, Legal);
+ setAction({G_EXTRACT, Ty}, Legal);
+ }
}
void X86LegalizerInfo::setLegalizerInfoAVX2() {
@@ -280,10 +284,14 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
for (auto Ty : {v16s32, v8s64})
setAction({MemOp, Ty}, Legal);
- for (auto Ty : {v64s8, v32s16, v16s32, v8s64})
+ for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) {
setAction({G_INSERT, Ty}, Legal);
- for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64})
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) {
setAction({G_INSERT, 1, Ty}, Legal);
+ setAction({G_EXTRACT, Ty}, Legal);
+ }
/************ VLX *******************/
if (!Subtarget.hasVLX())
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index e36a47506ba0..24845beac22d 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -11,10 +11,23 @@
//
//===----------------------------------------------------------------------===//
+#include "X86.h"
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#include "X86RegisterBankInfo.h"
+#endif
#include "X86Subtarget.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Triple.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#endif
#include "llvm/IR/Attributes.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
@@ -336,6 +349,35 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
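+// Concrete accessor that owns the GlobalISel objects for X86. It now lives
+// with the subtarget (moved from X86TargetMachine.cpp) so every per-function
+// subtarget carries its own call lowering, legalizer, register bank info and
+// instruction selector.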
+struct X86GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
unsigned StackAlignOverride)
@@ -360,6 +402,19 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
setPICStyle(PICStyles::StubPIC);
else if (isTargetELF())
setPICStyle(PICStyles::GOT);
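+ // Wire up GlobalISel: install a stub accessor when GlobalISel is not built,
+ // otherwise the X86 accessor populated below.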
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ X86GISelActualAccessor *GISel = new X86GISelActualAccessor();
+
+ GISel->CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
+ GISel->Legalizer.reset(new X86LegalizerInfo(*this, TM));
+
+ auto *RBI = new X86RegisterBankInfo(*getRegisterInfo());
+ GISel->RegBankInfo.reset(RBI);
+ GISel->InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI));
+#endif
+ setGISelAccessor(*GISel);
}
const CallLowering *X86Subtarget::getCallLowering() const {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 550e95c39ab5..fa0afe29586b 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -58,7 +58,7 @@ protected:
};
enum X86ProcFamilyEnum {
- Others, IntelAtom, IntelSLM
+ Others, IntelAtom, IntelSLM, IntelGLM
};
/// X86 processor family: Intel Atom, and others
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index a9f42cacf788..8d891c983fab 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -15,9 +15,6 @@
#include "X86.h"
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-#include "X86RegisterBankInfo.h"
-#endif
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
@@ -31,7 +28,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/ExecutionDepsFix.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
@@ -212,35 +208,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
X86TargetMachine::~X86TargetMachine() = default;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct X86GISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CallLoweringInfo;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -280,20 +247,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
Options.StackAlignmentOverride);
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- X86GISelActualAccessor *GISel = new X86GISelActualAccessor();
-
- GISel->CallLoweringInfo.reset(new X86CallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new X86LegalizerInfo(*I, *this));
-
- auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo());
- GISel->RegBankInfo.reset(RBI);
- GISel->InstSelector.reset(createX86InstructionSelector(
- *this, *I, *RBI));
-#endif
- I->setGISelAccessor(*GISel);
}
return I.get();
}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 1bf267d34ec2..aaa6d58bd134 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -40,6 +40,7 @@ public:
~X86TargetMachine() override;
const X86Subtarget *getSubtargetImpl(const Function &F) const override;
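+ // Deleted so callers cannot bypass the per-function subtarget lookup above.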
+ const X86Subtarget *getSubtargetImpl() const = delete;
TargetIRAnalysis getTargetIRAnalysis() override;