Diffstat (limited to 'lib/Target')
116 files changed, 1845 insertions, 863 deletions
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d0c0956b87ca..629ad5c61b78 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -942,6 +942,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, AArch64::XZR, NextMBBI); case AArch64::CMP_SWAP_128: return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); + } return false; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 550174b22a89..dc916c034661 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1125,7 +1125,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (RegInfo->hasBasePointer(MF)) BasePointerReg = RegInfo->getBaseRegister(); - bool ExtraCSSpill = false; + unsigned ExtraCSSpill = 0; const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1153,7 +1153,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && !RegInfo->isReservedReg(MF, PairedReg)) - ExtraCSSpill = true; + ExtraCSSpill = PairedReg; } } @@ -1186,8 +1186,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // register scavenging. If we already spilled an extra callee-saved register // above to keep the number of spills even, we don't need to do anything else // here. - if (BigStack && !ExtraCSSpill) { - if (UnspilledCSGPR != AArch64::NoRegister) { + if (BigStack) { + if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) << " to get a scratch register.\n"); SavedRegs.set(UnspilledCSGPR); @@ -1196,15 +1196,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = true; + ExtraCSSpill = UnspilledCSGPRPaired; NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create // an emergency spill slot. 
- if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; - int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false); + if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass &RC = AArch64::GPR64RegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + int FI = MFI.CreateStackObject(Size, Align, false); RS->addScavengingFrameIndex(FI); DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4ddc95199d4c..a7c98fbb425f 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -91,6 +91,7 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); +STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, @@ -105,6 +106,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +static cl::opt<bool> +EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, + cl::desc("Enable AArch64 logical imm instruction " + "optimization"), + cl::init(true)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -787,6 +794,140 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, + const APInt &Demanded, + TargetLowering::TargetLoweringOpt &TLO, + unsigned NewOpc) { + uint64_t OldImm = Imm, NewImm, Enc; + uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; + + // Return if the immediate is already all zeros, all ones, a bimm32 or a + // bimm64. + if (Imm == 0 || Imm == Mask || + AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) + return false; + + unsigned EltSize = Size; + uint64_t DemandedBits = Demanded.getZExtValue(); + + // Clear bits that are not demanded. + Imm &= DemandedBits; + + while (true) { + // The goal here is to set the non-demanded bits in a way that minimizes + // the number of transitions between 0 and 1. In order to achieve this goal, + // we set the non-demanded bits to the value of the preceding demanded bits. + // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a + // non-demanded bit), we copy bit0 (1) to the least significant 'x', + // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. + // The final result is 0b11000011. + uint64_t NonDemandedBits = ~DemandedBits; + uint64_t InvertedImm = ~Imm & DemandedBits; + uint64_t RotatedImm = + ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & + NonDemandedBits; + uint64_t Sum = RotatedImm + NonDemandedBits; + bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); + uint64_t Ones = (Sum + Carry) & NonDemandedBits; + NewImm = (Imm | Ones) & Mask; + + // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate + // or all-ones or all-zeros, in which case we can stop searching. Otherwise, + // we halve the element size and continue the search.
+ if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) + break; + + // We cannot shrink the element size any further if it is 2-bits. + if (EltSize == 2) + return false; + + EltSize /= 2; + Mask >>= EltSize; + uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; + + // Return if there is mismatch in any of the demanded bits of Imm and Hi. + if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) + return false; + + // Merge the upper and lower halves of Imm and DemandedBits. + Imm |= Hi; + DemandedBits |= DemandedBitsHi; + } + + ++NumOptimizedImms; + + // Replicate the element across the register width. + while (EltSize < Size) { + NewImm |= NewImm << EltSize; + EltSize *= 2; + } + + (void)OldImm; + assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && + "demanded bits should never be altered"); + assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); + + // Create the new constant immediate node. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + // If the new constant immediate is all-zeros or all-ones, let the target + // independent DAG combine optimize this node. + if (NewImm == 0 || NewImm == OrigMask) + return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT)); + + // Otherwise, create a machine node so that target independent DAG combine + // doesn't undo this optimization. + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + SDValue New( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + + return TLO.CombineTo(Op, New); +} + +bool AArch64TargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + // Delay this optimization to as late as possible. + if (!TLO.LegalOps) + return false; + + if (!EnableOptimizeLogicalImm) + return false; + + EVT VT = Op.getValueType(); + if (VT.isVector()) + return false; + + unsigned Size = VT.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "i32 or i64 is expected after legalization."); + + // Exit early if we demand all bits. + if (Demanded.countPopulation() == Size) + return false; + + unsigned NewOpc; + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; + break; + case ISD::OR: + NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; + break; + case ISD::XOR: + NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; + break; + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!C) + return false; + uint64_t Imm = C->getZExtValue(); + return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); +} + /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. 
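The loop body in optimizeLogicalImm above is dense bit manipulation. As a sanity check, here is a minimal standalone C++ sketch (the helper and driver names are illustrative, not LLVM code) that runs one iteration of that body on the worked example from the comment:

  #include <cstdint>
  #include <cstdio>

  // Fill each non-demanded bit with the value of the preceding (lower)
  // demanded bit, minimizing 0/1 transitions so the result is more likely
  // to be encodable as a bitmask immediate. Mirrors the loop body above.
  static uint64_t fillNonDemandedBits(uint64_t Imm, uint64_t DemandedBits,
                                      unsigned EltSize, uint64_t Mask) {
    Imm &= DemandedBits;                        // clear non-demanded bits
    uint64_t NonDemandedBits = ~DemandedBits;
    uint64_t InvertedImm = ~Imm & DemandedBits; // demanded bits that are 0
    // Rotate left by one so each 'x' position sees the demanded bit below it.
    uint64_t RotatedImm =
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
        NonDemandedBits;
    // The add sweeps a carry through each run of 'x' bits that follows a
    // demanded 0, clearing that run; runs following a demanded 1 stay set.
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    return (Imm | Ones) & Mask;
  }

  int main() {
    // The comment's example 0bx10xx0x1: demanded bits 0, 2, 5, 6 hold the
    // values 1, 0, 0, 1; bits 1, 3, 4, 7 are the non-demanded 'x' bits.
    uint64_t NewImm = fillNonDemandedBits(/*Imm=*/0x41, /*DemandedBits=*/0x65,
                                          /*EltSize=*/8, /*Mask=*/0xFF);
    printf("0x%llx\n", (unsigned long long)NewImm); // 0xc3, i.e. 0b11000011
    return 0;
  }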
@@ -3418,11 +3559,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Other Lowering Code //===----------------------------------------------------------------------===// +SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + N->getOffset(), Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); +} + +// (loadGOT sym) +template <class NodeTy> +SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. + return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); +} + +// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG) + const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, Ty, + getTargetNode(N, Ty, DAG, AArch64II::MO_G3), + getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC)); +} + +// (addlow (adrp %hi(sym)) %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE); + SDValue Lo = getTargetNode(N, Ty, DAG, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); +} + SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); + GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -3430,32 +3635,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && "unexpected offset in global node"); - // This also catched the large code model case for Darwin. + // This also catches the large code model case for Darwin. 
if ((OpFlags & AArch64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(GN, DAG); } if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(GN, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | AArch64II::MO_PAGE); - unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(GN, DAG); } } @@ -4232,90 +4420,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(JT, DAG); } - - SDValue Hi = - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. 
if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_GOT); - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(CP, DAG); } - - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(CP, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. - SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); + BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(BA, DAG); } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(BA, DAG); } } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index a023b4373835..6081b07479b9 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -255,6 +255,9 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// Returns true if the target allows unaligned memory accesses of the @@ -508,6 +511,18 @@ private: const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) 
const; + SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + template <class NodeTy> SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const; + template <class NodeTy> + SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const; + template <class NodeTy> SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 867074c3c374..71826bec6b11 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -14,6 +14,9 @@ //===---------------------------------- // Atomic fences //===---------------------------------- +let AddedComplexity = 15, Size = 0 in +def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering), + [(atomic_fence imm:$ordering, 0)]>, Sched<[]>; def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 16be4432b160..c44daf306ea9 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -693,11 +693,11 @@ def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>; def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>; def gi_addsub_shifted_imm32 : - GIComplexOperandMatcher<s32, (ops i32imm, i32imm), "selectArithImmed">, + GIComplexOperandMatcher<s32, "selectArithImmed">, GIComplexPatternEquiv<addsub_shifted_imm32>; def gi_addsub_shifted_imm64 : - GIComplexOperandMatcher<s64, (ops i32imm, i32imm), "selectArithImmed">, + GIComplexOperandMatcher<s64, "selectArithImmed">, GIComplexPatternEquiv<addsub_shifted_imm64>; class neg_addsub_shifted_imm<ValueType Ty> diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 41fc8eceab5c..cb268828455e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2320,7 +2320,7 @@ void AArch64InstrInfo::storeRegToStackSlot( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -2424,7 +2424,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -2649,7 +2649,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { - assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == + TRI.getRegSizeInBits(*getRegClass(SrcReg)) && "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, @@ -2735,7 +2736,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } if (FillRC) { - assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == + TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); loadRegFromStackSlot(MBB, 
InsertPt, DstReg, FrameIndex, FillRC, &TRI); MachineInstr &LoadMI = *--InsertPt; @@ -3025,7 +3027,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return false; } -void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void AArch64InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index bacce441f6c5..4cd14db633b9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -205,7 +205,7 @@ public: const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 5e01b6cd2b46..b0e0e3eb4ba7 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -41,12 +41,17 @@ using namespace llvm; namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + class AArch64InstructionSelector : public InstructionSelector { public: AArch64InstructionSelector(const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI); + void beginFunction(const MachineFunction &MF) override; bool select(MachineInstr &I) const override; private: @@ -62,14 +67,19 @@ private: bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; - bool selectArithImmed(MachineOperand &Root, MachineOperand &Result1, - MachineOperand &Result2) const; + ComplexRendererFn selectArithImmed(MachineOperand &Root) const; const AArch64TargetMachine &TM; const AArch64Subtarget &STI; const AArch64InstrInfo &TII; const AArch64RegisterInfo &TRI; const AArch64RegisterBankInfo &RBI; + bool ForCodeSize; + + PredicateBitset AvailableFeatures; + PredicateBitset + computeAvailableFeatures(const MachineFunction *MF, + const AArch64Subtarget *Subtarget) const; // We declare the temporaries used by selectImpl() in the class to minimize the // cost of constructing placeholder values. 
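The selectArithImmed change in this file swaps the old out-parameter interface for GlobalISel's renderer-closure idiom: a complex-operand matcher now returns nullptr on failure, or a callable that appends the matched operands when the selector later builds the instruction. A self-contained sketch of the idiom, using illustrative stand-in types rather than the LLVM API:

  #include <cstdint>
  #include <functional>
  #include <vector>

  // Stand-in for InstructionSelector::ComplexRendererFn: a nullable
  // callable that knows how to render the matched operands (modeled here
  // as a plain operand vector instead of a MachineInstrBuilder).
  using RendererFn = std::function<void(std::vector<int64_t> &Ops)>;

  // Match a value encodable as a 12-bit unsigned immediate, optionally
  // shifted left by 12 (the AArch64 add/sub immediate form).
  RendererFn matchArithImmed(uint64_t Immed) {
    unsigned ShiftAmt;
    if (Immed >> 12 == 0)
      ShiftAmt = 0;
    else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
      ShiftAmt = 12;
      Immed >>= 12;
    } else
      return nullptr; // no match; the caller tries the next pattern
    // Capture the results by value; the selector invokes the closure
    // later, once it has decided which instruction to build.
    return [=](std::vector<int64_t> &Ops) {
      Ops.push_back(static_cast<int64_t>(Immed));
      Ops.push_back(ShiftAmt);
    };
  }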
@@ -88,7 +98,7 @@ AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) + TRI(*STI.getRegisterInfo()), RBI(RBI), ForCodeSize(), AvailableFeatures() #define GET_GLOBALISEL_TEMPORARIES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT @@ -567,6 +577,12 @@ bool AArch64InstructionSelector::selectVaStartDarwin( return true; } +void AArch64InstructionSelector::beginFunction( + const MachineFunction &MF) { + ForCodeSize = MF.getFunction()->optForSize(); + AvailableFeatures = computeAvailableFeatures(&MF, &STI); +} + bool AArch64InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -1312,9 +1328,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. -bool AArch64InstructionSelector::selectArithImmed( - MachineOperand &Root, MachineOperand &Result1, - MachineOperand &Result2) const { +InstructionSelector::ComplexRendererFn +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { MachineInstr &MI = *Root.getParent(); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -1333,13 +1348,13 @@ bool AArch64InstructionSelector::selectArithImmed( else if (Root.isReg()) { MachineInstr *Def = MRI.getVRegDef(Root.getReg()); if (Def->getOpcode() != TargetOpcode::G_CONSTANT) - return false; + return nullptr; MachineOperand &Op1 = Def->getOperand(1); if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) - return false; + return nullptr; Immed = Op1.getCImm()->getZExtValue(); } else - return false; + return nullptr; unsigned ShiftAmt; @@ -1349,14 +1364,10 @@ bool AArch64InstructionSelector::selectArithImmed( ShiftAmt = 12; Immed = Immed >> 12; } else - return false; + return nullptr; unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); - Result1.ChangeToImmediate(Immed); - Result1.clearParent(); - Result2.ChangeToImmediate(ShVal); - Result2.clearParent(); - return true; + return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); }; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index eec089087fe0..cf1c0b66db58 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -79,14 +79,14 @@ def : WriteRes<WriteIM64, [FalkorUnitX]> { let Latency = 5; } def : WriteRes<WriteBr, [FalkorUnitB]> { let Latency = 1; } def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; } def : WriteRes<WriteLD, [FalkorUnitLD]> { let Latency = 3; } -def : WriteRes<WriteST, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]> - { let Latency = 3; let NumMicroOps = 3; } +def : WriteRes<WriteST, [FalkorUnitST, FalkorUnitSD]> + { let Latency = 0; let NumMicroOps = 2; } def : WriteRes<WriteSTP, [FalkorUnitST, FalkorUnitSD]> { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 5; } +def : WriteRes<WriteAdr, [FalkorUnitXYZ]> { let Latency = 1; } def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let 
Latency = 5; } -def : WriteRes<WriteSTIdx, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]> - { let Latency = 4; let NumMicroOps = 3; } +def : WriteRes<WriteSTIdx, [FalkorUnitST, FalkorUnitSD]> + { let Latency = 0; let NumMicroOps = 2; } def : WriteRes<WriteF, [FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 3; let NumMicroOps = 2; } def : WriteRes<WriteFCmp, [FalkorUnitVXVY]> { let Latency = 2; } diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 4bd77d344488..8f8eeef8a6cf 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -326,6 +326,10 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- +def : InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>; +def : InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>; + def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; @@ -421,6 +425,7 @@ def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; + // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; @@ -433,7 +438,6 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; - // Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; @@ -461,6 +465,7 @@ def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; + // Miscellaneous Data-Processing Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; @@ -502,28 +507,30 @@ def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>; -def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; +def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; -def : InstRW<[WriteST], (instregex "^LDC.*$")>; -def : InstRW<[WriteST], (instregex "^STLR(B|H|W|X)$")>; -def : InstRW<[WriteST], (instregex "^STXP(W|X)$")>; -def : InstRW<[WriteST], (instregex "^STXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXP(W|X)$")>; +def : 
InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; -def : InstRW<[WriteSTX], (instregex "^STLXP(W|X)$")>; -def : InstRW<[WriteSTX], (instregex "^STLXR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVST], (instregex "^STP(D|S)(i|post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STP(W|X)(i|post|pre)$")>; +def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>; +def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>; def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>; -def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)(post|pre|ui)$")>; +def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>; +def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>; def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>; def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>; def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^STPQ(i|post|pre)$")>; +def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>; +def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>; diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td index 9cdb4be4246b..e64b2c441a19 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td @@ -28,7 +28,6 @@ //===----------------------------------------------------------------------===// // Define 1 micro-op types - def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } @@ -175,18 +174,33 @@ def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { //===----------------------------------------------------------------------===// // Define 3 micro-op types +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 3; let NumMicroOps = 3; } + def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 3; } + def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 5; let NumMicroOps = 3; } + def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 6; let NumMicroOps = 3; @@ -196,10 +210,12 @@ def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 3; } + def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { let Latency = 3; let NumMicroOps = 3; } + def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, FalkorUnitLD]> { let Latency = 3; @@ -259,6 +275,12 @@ def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { let NumMicroOps = 4; } +def FalkorWr_2LD_1ST_1SD_3cyc: 
SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + //===----------------------------------------------------------------------===// // Define 5 micro-op types @@ -289,6 +311,13 @@ def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, let NumMicroOps = 6; } +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + //===----------------------------------------------------------------------===// // Define 8 micro-op types diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d7bbc2bcd22c..4dbcc9581a84 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -2473,16 +2473,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - auto DB = AArch64DB::lookupDBByName(Tok.getString()); - if (!DB) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; + } else if (!DB) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; } Operands.push_back(AArch64Operand::CreateBarrier( diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 41ae70f85e58..fc89657bffd3 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" @@ -275,6 +276,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, } } + if (Opcode == AArch64::CompilerBarrier) { + O << '\t' << MAI.getCommentString() << " COMPILER BARRIER"; + printAnnotation(O, Annot); + return; + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 62dfa59483eb..33698d2b8c38 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -565,6 +565,9 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup)); return; + } else if (MI.getOpcode() == AArch64::CompilerBarrier) { + // This just prevents the compiler from reordering accesses, no actual code. 
+ return; } uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2c7a2d8962d0..0f331486d0f8 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -406,7 +406,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode + FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, + FeatureFastFMAF32 ] >; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 318de7f2e3d2..f5110857da84 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -116,8 +116,11 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffen(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFScratchOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -150,14 +153,12 @@ private: bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Omod) const; bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; @@ -953,8 +954,12 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, return true; } +static bool isLegalMUBUFImmOffset(unsigned Imm) { + return isUInt<12>(Imm); +} + static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); + return isLegalMUBUFImmOffset(Imm->getZExtValue()); } bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, @@ -1076,9 +1081,9 @@ SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { return N; } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); @@ -1087,8 +1092,22 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); - // (add n0, c1) + if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + unsigned Imm = CAddr->getZExtValue(); + assert(!isLegalMUBUFImmOffset(Imm) && + 
"should have been selected by other pattern"); + + SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, HighBits); + VAddr = SDValue(MovHighBits, 0); + ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + return true; + } + if (CurDAG->isBaseWithConstantOffset(Addr)) { + // (add n0, c1) + SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -1107,6 +1126,24 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, return true; } +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDValue Addr, + SDValue &SRsrc, + SDValue &SOffset, + SDValue &Offset) const { + ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); + if (!CAddr || !isLegalMUBUFImmOffset(CAddr)) + return false; + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); + return true; +} + bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, @@ -1628,38 +1665,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, return isNoNanSrc(Src); } -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - bool Res = SelectVOP3Mods(In, Src, SrcMods); - return Res && cast<ConstantSDNode>(SrcMods)->isNullValue(); +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { + if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) + return false; + + Src = In; + return true; } bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); - - return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() && - cast<ConstantSDNode>(Clamp)->isNullValue() && - cast<ConstantSDNode>(Omod)->isNullValue(); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return SelectVOP3Mods(In, Src, SrcMods); } @@ -1677,9 +1696,8 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, Src = In; SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index c0f336e082bd..e21775e61dd4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2315,12 +2315,13 @@ 
static bool simplifyI24(SDNode *Node24, unsigned OpIdx, SelectionDAG &DAG = DCI.DAG; SDValue Op = Node24->getOperand(OpIdx); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Op.getValueType(); APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI)) + if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO)) return true; return false; @@ -3361,7 +3362,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); @@ -3436,6 +3437,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ELSE) NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) + NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index d6aa0ba92bf7..13cbfe267932 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -231,6 +231,10 @@ public: AMDGPUAS getAMDGPUAS() const { return AMDGPUASI; } + + MVT getFenceOperandTy(const DataLayout &DL) const override { + return MVT::i32; + } }; namespace AMDGPUISD { @@ -244,6 +248,7 @@ enum NodeType : unsigned { // Function call. CALL, + TRAP, // Masked control flow nodes. IF, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 56f060984f08..c1706d12a2ea 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -78,6 +78,11 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; +def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", + SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, + [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] +>; + def AMDGPUconstdata_ptr : SDNode< "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<0, iPTR>]> diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index b8d681298dee..4e688ab0b105 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -50,6 +50,16 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +def u16ImmTarget : AsmOperandClass { + let Name = "U16Imm"; + let RenderMethod = "addImmOperands"; +} + +def s16ImmTarget : AsmOperandClass { + let Name = "S16Imm"; + let RenderMethod = "addImmOperands"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand<i32> { @@ -58,6 +68,12 @@ def u32imm : Operand<i32> { def u16imm : Operand<i16> { let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = u16ImmTarget; +} + +def s16imm : Operand<i16> { + let PrintMethod = "printU16ImmOperand"; + let ParserMatchClass = s16ImmTarget; } def u8imm : Operand<i8> { diff --git 
a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 14ee1c81f8fa..da247fea7de6 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -225,6 +225,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) { + if (isVerbose()) + OutStreamer->emitRawComment(" divergent unreachable"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 961f7186f373..70c848f3c7bd 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -479,6 +479,8 @@ public: bool isSMRDLiteralOffset() const; bool isDPPCtrl() const; bool isGPRIdxMode() const; + bool isS16Imm() const; + bool isU16Imm() const; StringRef getExpressionAsToken() const { assert(isExpr()); @@ -2836,6 +2838,28 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { // s_waitcnt //===----------------------------------------------------------------------===// +static bool +encodeCnt( + const AMDGPU::IsaInfo::IsaVersion ISA, + int64_t &IntVal, + int64_t CntVal, + bool Saturate, + unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned), + unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned)) +{ + bool Failed = false; + + IntVal = encode(ISA, IntVal, CntVal); + if (CntVal != decode(ISA, IntVal)) { + if (Saturate) { + IntVal = encode(ISA, IntVal, -1); + } else { + Failed = true; + } + } + return Failed; +} + bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { StringRef CntName = Parser.getTok().getString(); int64_t CntVal; @@ -2851,25 +2875,35 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if (getParser().parseAbsoluteExpression(CntVal)) return true; - if (getLexer().isNot(AsmToken::RParen)) - return true; - - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); - AMDGPU::IsaInfo::IsaVersion ISA = AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); - if (CntName == "vmcnt") - IntVal = encodeVmcnt(ISA, IntVal, CntVal); - else if (CntName == "expcnt") - IntVal = encodeExpcnt(ISA, IntVal, CntVal); - else if (CntName == "lgkmcnt") - IntVal = encodeLgkmcnt(ISA, IntVal, CntVal); - else - return true; - return false; + bool Failed = true; + bool Sat = CntName.endswith("_sat"); + + if (CntName == "vmcnt" || CntName == "vmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt); + } else if (CntName == "expcnt" || CntName == "expcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt); + } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") { + Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt); + } + + // To improve diagnostics, do not skip delimiters on errors + if (!Failed) { + if (getLexer().isNot(AsmToken::RParen)) { + return true; + } + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) { + const AsmToken NextToken = getLexer().peekTok(); + if (NextToken.is(AsmToken::Identifier)) { + Parser.Lex(); + } + } + } + + return Failed; } OperandMatchResultTy @@ -3858,6 +3892,14 @@ bool AMDGPUOperand::isGPRIdxMode() const { return isImm() && isUInt<4>(getImm()); } +bool AMDGPUOperand::isS16Imm() const { + return isImm() && (isInt<16>(getImm()) || 
isUInt<16>(getImm())); +} + +bool AMDGPUOperand::isU16Imm() const { + return isImm() && isUInt<16>(getImm()); +} + OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { SMLoc S = Parser.getTok().getLoc(); diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index a6609f0725ab..89eddb9ce961 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -11,7 +11,9 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; -def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; +def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen">; +def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [], 20>; + def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; @@ -958,21 +960,30 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; } // End Predicates = [Has16BitInsts] -class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag ld> { + def : Pat < + (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>; -def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; // 
BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, @@ -1054,19 +1065,29 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>; -class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; +multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag st> { + def : Pat < + (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)), + (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + >; + + def : Pat < + (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, + u16imm:$offset)), + (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) + >; +} -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; -def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; //===----------------------------------------------------------------------===// // MTBUF Patterns diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 4ecfa118fb27..bf16a8216001 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -83,8 +83,8 @@ unsigned GCNRegPressure::getRegKind(unsigned Reg, const auto RC = MRI.getRegClass(Reg); auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); return STI->isSGPRClass(RC) ? - (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) : - (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE); + (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) : + (STI->getRegSizeInBits(*RC) == 32 ? 
VGPR32 : VGPR_TUPLE); } void GCNRegPressure::inc(unsigned Reg, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp index 29a6ab9fbe93..647017d5061d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp @@ -286,20 +286,20 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, return ValueKind::Pipe; return StringSwitch<ValueKind>(BaseTypeName) + .Case("image1d_t", ValueKind::Image) + .Case("image1d_array_t", ValueKind::Image) + .Case("image1d_buffer_t", ValueKind::Image) + .Case("image2d_t", ValueKind::Image) + .Case("image2d_array_t", ValueKind::Image) + .Case("image2d_array_depth_t", ValueKind::Image) + .Case("image2d_array_msaa_t", ValueKind::Image) + .Case("image2d_array_msaa_depth_t", ValueKind::Image) + .Case("image2d_depth_t", ValueKind::Image) + .Case("image2d_msaa_t", ValueKind::Image) + .Case("image2d_msaa_depth_t", ValueKind::Image) + .Case("image3d_t", ValueKind::Image) .Case("sampler_t", ValueKind::Sampler) .Case("queue_t", ValueKind::Queue) - .Cases("image1d_t", - "image1d_array_t", - "image1d_buffer_t", - "image2d_t" , - "image2d_array_t", - "image2d_array_depth_t", - "image2d_array_msaa_t" - "image2d_array_msaa_depth_t" - "image2d_depth_t", - "image2d_msaa_t", - "image2d_msaa_depth_t", - "image3d_t", ValueKind::Image) .Default(isa<PointerType>(Ty) ? (Ty->getPointerAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ? diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 6c61fb1f2d6b..2364e7b7b5fb 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -15,6 +15,7 @@ using namespace llvm; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 
8 : 4; + StackGrowsUp = true; HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// MinInstAlignment = 4; diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f9d258f44a62..b0f0bf04a891 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -81,6 +81,11 @@ using namespace llvm; #define DEBUG_TYPE "si-fix-sgpr-copies" +static cl::opt<bool> EnableM0Merge( + "amdgpu-enable-merge-m0", + cl::desc("Merge and hoist M0 initializations"), + cl::init(false)); + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -108,7 +113,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -332,27 +337,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, return true; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - DenseSet<MachineBasicBlock*> Visited; +template <class UnaryPredicate> +bool searchPredecessors(const MachineBasicBlock *MBB, + const MachineBasicBlock *CutOff, + UnaryPredicate Predicate) { + + if (MBB == CutOff) + return false; + + DenseSet<const MachineBasicBlock*> Visited; SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), MBB->pred_end()); while (!Worklist.empty()) { - MachineBasicBlock *mbb = Worklist.back(); - Worklist.pop_back(); + MachineBasicBlock *MBB = Worklist.pop_back_val(); - if (!Visited.insert(mbb).second) + if (!Visited.insert(MBB).second) continue; - if (hasTerminatorThatModifiesExec(*mbb, *TRI)) + if (MBB == CutOff) + continue; + if (Predicate(MBB)) return true; - Worklist.insert(Worklist.end(), mbb->pred_begin(), mbb->pred_end()); + Worklist.append(MBB->pred_begin(), MBB->pred_end()); } return false; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + +// Checks if there is a potential path from the From instruction to the To +// instruction. If CutOff is specified and lies on that path, the portion of +// the path above CutOff is ignored and From is reported as not reachable. +static bool isReachable(const MachineInstr *From, + const MachineInstr *To, + const MachineBasicBlock *CutOff, + MachineDominatorTree &MDT) { + // True if the block of From dominates the block of To, or if both + // instructions are in the same block and From comes first. + if (MDT.dominates(From, To)) + return true; + + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + if (MBBFrom == MBBTo) + return false; + + // Instructions are in different blocks, do predecessor search. + // We should almost never get here since we do not usually produce M0 stores + // other than -1. + return searchPredecessors(MBBTo, CutOff, [MBBFrom] + (const MachineBasicBlock *MBB) { return MBB == MBBFrom; }); +} +
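(Aside: the new searchPredecessors helper above is a generic, cut-off-bounded worklist walk over transitive CFG predecessors. A minimal standalone sketch of the same shape, with illustrative names rather than code from this patch, assuming the usual llvm/ADT includes:

template <class UnaryPredicate>
static bool anyTransitivePredecessor(const MachineBasicBlock *Root,
                                     const MachineBasicBlock *CutOff,
                                     UnaryPredicate Pred) {
  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<const MachineBasicBlock *, 4> Worklist(Root->pred_begin(),
                                                     Root->pred_end());
  while (!Worklist.empty()) {
    const MachineBasicBlock *MBB = Worklist.pop_back_val();
    // Skip blocks we have already seen, and never walk above the cut-off.
    if (!Visited.insert(MBB).second || MBB == CutOff)
      continue;
    if (Pred(MBB))
      return true;
    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }
  return false;
}

The same traversal serves both callers: predsHasDivergentTerminator passes no cut-off, while isReachable uses the cut-off to bound the search at a dominating block.)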
+// Hoist and merge identical SGPR initializations into a common predecessor. +// This is intended to combine M0 initializations, but can work with any +// SGPR. A VGPR cannot be processed since we cannot guarantee vector +// execution. +static bool hoistAndMergeSGPRInits(unsigned Reg, + const MachineRegisterInfo &MRI, + MachineDominatorTree &MDT) { + // List of inits by immediate value. + typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap; + InitListMap Inits; + // List of clobbering instructions. + SmallVector<MachineInstr*, 8> Clobbers; + bool Changed = false; + + for (auto &MI : MRI.def_instructions(Reg)) { + MachineOperand *Imm = nullptr; + for (auto &MO: MI.operands()) { + if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) || + (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) { + Imm = nullptr; + break; + } else if (MO.isImm()) + Imm = &MO; + } + if (Imm) + Inits[Imm->getImm()].push_front(&MI); + else + Clobbers.push_back(&MI); + } + + for (auto &Init : Inits) { + auto &Defs = Init.second; + + for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) { + MachineInstr *MI1 = *I1; + + for (auto I2 = std::next(I1); I2 != E; ) { + MachineInstr *MI2 = *I2; + + // Check for any possible interference. + auto interferes = [&](MachineBasicBlock::iterator From, + MachineBasicBlock::iterator To) -> bool { + + assert(MDT.dominates(&*To, &*From)); + + auto clobberInterferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool { + const MachineBasicBlock *MBBFrom = From->getParent(); + const MachineBasicBlock *MBBTo = To->getParent(); + bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT); + bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT); + if (!MayClobberFrom && !MayClobberTo) + return false; + if ((MayClobberFrom && !MayClobberTo) || + (!MayClobberFrom && MayClobberTo)) + return true; + // Both can clobber. This is not an interference only if both are + // dominated by Clobber and belong to the same block, or if Clobber + // properly dominates To: since To >> From, Clobber then dominates + // both and is located in a common dominator.
+ return !((MBBFrom == MBBTo && + MDT.dominates(Clobber, &*From) && + MDT.dominates(Clobber, &*To)) || + MDT.properlyDominates(Clobber->getParent(), MBBTo)); + }; + + return (any_of(Clobbers, clobberInterferes)) || + (any_of(Inits, [&](InitListMap::value_type &C) { + return C.first != Init.first && any_of(C.second, clobberInterferes); + })); + }; + + if (MDT.dominates(MI1, MI2)) { + if (!interferes(MI2, MI1)) { + DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber() + << " " << *MI2); + MI2->eraseFromParent(); + Defs.erase(I2++); + Changed = true; + continue; + } + } else if (MDT.dominates(MI2, MI1)) { + if (!interferes(MI1, MI2)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } else { + auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(), + MI2->getParent()); + if (!MBB) { + ++I2; + continue; + } + + MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); + if (!interferes(MI1, I) && !interferes(MI2, I)) { + DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() + << " " << *MI1 << "and moving from BB#" + << MI2->getParent()->getNumber() << " to BB#" + << I->getParent()->getNumber() << " " << *MI2); + I->getParent()->splice(I, MI2->getParent(), MI2); + MI1->eraseFromParent(); + Defs.erase(I1++); + Changed = true; + break; + } + } + ++I2; + } + ++I1; + } + } + + if (Changed) + MRI.clearKillFlags(Reg); + + return Changed; +} + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -485,5 +649,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } } + if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) + hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT); + return true; }
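To make hoistAndMergeSGPRInits concrete, here is a rough before/after for a diamond whose arms initialize M0 to the same immediate (hand-written MIR-style pseudocode, assuming no clobber of M0 is reachable from the hoist point):

// Before:
//   bb.0:
//     S_CBRANCH_SCC1 %bb.2
//   bb.1:
//     $m0 = S_MOV_B32 -1
//     DS_READ_B32 ...
//   bb.2:
//     $m0 = S_MOV_B32 -1
//     DS_WRITE_B32 ...
//
// After (one init spliced to the nearest common dominator, the other erased):
//   bb.0:
//     $m0 = S_MOV_B32 -1
//     S_CBRANCH_SCC1 %bb.2
//   bb.1:
//     DS_READ_B32 ...
//   bb.2:
//     DS_WRITE_B32 ...

Neither init dominates the other, so this is the else-branch above: one copy moves to the first non-PHI point of the nearest common dominator, provided the interference check passes for both original sites.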
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index abe6af9a6d3f..86e3b37b09e9 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -101,10 +101,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (ScratchRsrcReg == AMDGPU::NoRegister) + if (ScratchRsrcReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; if (ST.hasSGPRInitBug() || @@ -122,8 +124,6 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // We find the resource first because it has an alignment requirement. - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded)); @@ -143,24 +143,34 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( +// Shift down registers reserved for the scratch wave offset and stack pointer +// SGPRs. +std::pair<unsigned, unsigned> +SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ST.hasSGPRInitBug() || - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) - return ScratchWaveOffsetReg; - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + // No replacement necessary. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister || + !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { + assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); + } + + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (ST.hasSGPRInitBug()) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF); if (NumPreloaded > AllSGPRs.size()) - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); AllSGPRs = AllSGPRs.slice(NumPreloaded); @@ -175,26 +185,42 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // register from the list to consider, it means that when this // register is being used for the scratch wave offset and there // are no other free SGPRs, then the value will stay in this register. + // + 1 if stack pointer is used. // ---- - // 13 - if (AllSGPRs.size() < 13) - return ScratchWaveOffsetReg; + // 13 (+1) + unsigned ReservedRegCount = 13; + if (SPReg != AMDGPU::NoRegister) + ++ReservedRegCount; - for (MCPhysReg Reg : AllSGPRs.drop_back(13)) { + if (AllSGPRs.size() < ReservedRegCount) + return std::make_pair(ScratchWaveOffsetReg, SPReg); + + bool HandledScratchWaveOffsetReg = + ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven't added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) { - if (!MRI.isAllocatable(Reg) || - TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) - continue; + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + if (!HandledScratchWaveOffsetReg) { + HandledScratchWaveOffsetReg = true; - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - MFI->setScratchWaveOffsetReg(Reg); - return Reg; + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + MFI->setScratchWaveOffsetReg(Reg); + ScratchWaveOffsetReg = Reg; + } else { + if (SPReg == AMDGPU::NoRegister) + break; + + MRI.replaceRegWith(SPReg, Reg); + MFI->setStackPtrOffsetReg(Reg); + SPReg = Reg; + break; + } } } - return ScratchWaveOffsetReg; + return std::make_pair(ScratchWaveOffsetReg, SPReg); } void SIFrameLowering::emitPrologue(MachineFunction &MF, @@ -220,18 +246,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); - unsigned ScratchWaveOffsetReg - = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); - - if (ScratchRsrcReg == AMDGPU::NoRegister) { - assert(ScratchWaveOffsetReg == AMDGPU::NoRegister); - return; - } - - assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); - // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef // or a constant without an associated object. @@ -244,19 +258,49 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); + unsigned SPReg = MFI->getStackPtrOffsetReg(); + if (SPReg != AMDGPU::NoRegister) { + DebugLoc DL; + int64_t StackSize = MF.getFrameInfo().getStackSize(); + + if (StackSize == 0) { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()); + } else { + BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg) + .addReg(MFI->getScratchWaveOffsetReg()) + .addImm(StackSize * ST.getWavefrontSize()); + } + } + + unsigned ScratchRsrcReg + = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + + unsigned ScratchWaveOffsetReg; + std::tie(ScratchWaveOffsetReg, SPReg) + = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + + // It's possible to have uses of only ScratchWaveOffsetReg without + // ScratchRsrcReg if it's only used for the initialization of flat_scratch, + // but the inverse is not true. + if (ScratchWaveOffsetReg == AMDGPU::NoRegister) { + assert(ScratchRsrcReg == AMDGPU::NoRegister); + return; + } + // We need to insert initialization of the scratch resource descriptor. unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } - bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg); - bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg); + bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); + bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && + MRI.isPhysRegUsed(ScratchRsrcReg); // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. 
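One prologue detail above deserves a note: the stack pointer SGPR holds a byte offset for the whole wave, while MachineFrameInfo reports a per-lane frame size, hence the multiply by the wavefront size. A sketch of the arithmetic, using the same names as the patch (the concrete numbers are only an example):

// Per-lane frame size in bytes, as laid out by frame lowering.
int64_t StackSize = MF.getFrameInfo().getStackSize();
// Scratch is allocated for every lane of the wave, so the wave-level SP
// advances by StackSize for each lane, e.g. 16 bytes/lane * 64 lanes = 1024.
int64_t WaveBump = StackSize * ST.getWavefrontSize();
// Emitted as: s_add_u32 SPReg, ScratchWaveOffsetReg, WaveBump
// (or a plain COPY from ScratchWaveOffsetReg when StackSize == 0).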
@@ -469,7 +513,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // this also ensures we shouldn't need a register for the offset when // emergency scavenging. int ScavengeFI = MFI.CreateFixedObject( - AMDGPU::SGPR_32RegClass.getSize(), 0, false); + TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index 1bfc08093da2..7ccd02b3c86a 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -49,7 +49,7 @@ private: SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - unsigned getReservedPrivateSegmentWaveByteOffsetReg( + std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg( const SISubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index dd867b15b4c7..ce74a7cd8b04 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -287,8 +287,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // This is s_memtime on SI and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); @@ -1644,7 +1644,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset) { - int NumElts = SuperRC->getSize() / 4; + int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; // Skip out of bounds offsets, or else we would end up using an undefined // register.
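Note the trap change above: ISD::TRAP and ISD::DEBUGTRAP go from Legal to Custom, replacing the S_TRAP_PSEUDO custom inserter (deleted below) with DAG-level lowering in the new lowerTRAP. Roughly, and hedging on the exact trap ID values, the generated code is:

// With the HSA trap handler enabled, lowerTRAP (added below) materializes the
// queue pointer in SGPR0_SGPR1, which the handler ABI expects, then traps:
//   s_mov_b64 s[0:1], <queue_ptr>
//   s_trap 2            ; TrapIDLLVMTrap (TrapIDLLVMDebugTrap for debugtrap)
// Without a trap handler, llvm.trap lowers to s_endpgm, and llvm.debugtrap
// emits a "debugtrap handler not supported" diagnostic and lowers to nothing.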
@@ -1793,17 +1793,18 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return LoopBB; } -static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) { - switch (VecRC->getSize()) { - case 4: +static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, + const TargetRegisterClass *VecRC) { + switch (TRI.getRegSizeInBits(*VecRC)) { + case 32: // 4 bytes return AMDGPU::V_MOVRELD_B32_V1; - case 8: + case 64: // 8 bytes return AMDGPU::V_MOVRELD_B32_V2; - case 16: + case 128: // 16 bytes return AMDGPU::V_MOVRELD_B32_V4; - case 32: + case 256: // 32 bytes return AMDGPU::V_MOVRELD_B32_V8; - case 64: + case 512: // 64 bytes return AMDGPU::V_MOVRELD_B32_V16; default: llvm_unreachable("unsupported size for MOVRELD pseudos"); @@ -1863,7 +1864,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(MBB, I, DL, MovRelDesc) .addReg(Dst, RegState::Define) @@ -1907,7 +1908,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(PhiReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC)); + const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); BuildMI(*LoopBB, InsPt, DL, MovRelDesc) .addReg(Dst, RegState::Define) @@ -1948,50 +1949,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } switch (MI.getOpcode()) { - case AMDGPU::S_TRAP_PSEUDO: { - const DebugLoc &DL = MI.getDebugLoc(); - const int TrapType = MI.getOperand(0).getImm(); - - if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && - Subtarget->isTrapHandlerEnabled()) { - - MachineFunction *MF = BB->getParent(); - SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); - assert(UserSGPR != AMDGPU::NoRegister); - - if (!BB->isLiveIn(UserSGPR)) - BB->addLiveIn(UserSGPR); - - BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1) - .addReg(UserSGPR); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP)) - .addImm(TrapType) - .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit); - } else { - switch (TrapType) { - case SISubtarget::TrapIDLLVMTrap: - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM)); - break; - case SISubtarget::TrapIDLLVMDebugTrap: { - DiagnosticInfoUnsupported NoTrap(*MF->getFunction(), - "debugtrap handler not supported", - DL, - DS_Warning); - LLVMContext &C = MF->getFunction()->getContext(); - C.diagnose(NoTrap); - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP)) - .addImm(0); - break; - } - default: - llvm_unreachable("unsupported trap handler type!"); - } - } - - MI.eraseFromParent(); - return BB; - } case AMDGPU::SI_INIT_M0: BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) @@ -2163,6 +2120,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); + + case ISD::TRAP: + case ISD::DEBUGTRAP: + return lowerTRAP(Op, DAG); } return SDValue(); } @@ -2431,6 +2392,57 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } +SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + MachineFunction &MF = 
DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + + unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? + SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; + + if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && + Subtarget->isTrapHandlerEnabled()) { + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(TrapID, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); + } + + switch (TrapID) { + case SISubtarget::TrapIDLLVMTrap: + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); + case SISubtarget::TrapIDLLVMDebugTrap: { + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "debugtrap handler not supported", + Op.getDebugLoc(), + DS_Warning); + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.diagnose(NoTrap); + return Chain; + } + default: + llvm_unreachable("unsupported trap handler type!"); + } + + return Chain; +} + SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { // FIXME: Use inline constants (src_{shared, private}_base) instead. @@ -3410,9 +3422,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) + return SDValue(); + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - VT == MVT::f16) { + if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to // the CI documentation, have a worst-case error of 1 ulp. @@ -4696,7 +4710,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || + if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c2a3e62aa827..9122cd72d323 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -428,8 +428,8 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, const MachineInstr &MIA = *MI; const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); - unsigned Size = RC->getSize(); - Result.second = Result.first + (Size / 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.second = Result.first + (Size / 32); return Result; }
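A quick reference for the size-query migration running through these AMDGPU files (and the ARM ones further down): TargetRegisterClass::getSize()/getAlignment(), which worked in bytes, are replaced by TargetRegisterInfo queries with bits as the canonical unit. The conversions used throughout, as a sketch (RC is any TargetRegisterClass, TRI the TargetRegisterInfo):

unsigned Bits   = TRI->getRegSizeInBits(*RC); // register width in bits
unsigned Bytes  = Bits / 8;                   // the old RC->getSize()
unsigned DWords = Bits / 32;                  // 32-bit lanes; used for wait
                                              // counts and subregister counts
unsigned SpillSize  = TRI->getSpillSize(*RC);      // stack slot size in bytes
unsigned SpillAlign = TRI->getSpillAlignment(*RC); // stack slot alignment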
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 47257ce16ceb..9f32ecfa52ff 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -216,8 +216,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // XXX - What if this is a write into a super register? const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); - unsigned Size = RC->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; + unsigned Size = TRI->getRegSizeInBits(*RC); + Result.Named.LGKM = Size > 32 ? 2 : 1; } else { // s_dcache_inv etc. do not have a destination register. Assume we // want a wait on these. @@ -289,12 +289,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, const MachineOperand &Reg) const { - unsigned Size = RC->getSize(); - assert(Size >= 4); + unsigned Size = TRI->getRegSizeInBits(*RC); + assert(Size >= 32); RegInterval Result; Result.first = TRI->getEncodingValue(Reg.getReg()); - Result.second = Result.first + Size / 4; + Result.second = Result.first + Size / 32; return Result; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 05ac67d26620..92e452a3d6a0 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -138,6 +138,11 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, } if (isSMRD(Opc0) && isSMRD(Opc1)) { + // Skip time and cache invalidation instructions. + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) + return false; + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); // Check base reg. @@ -245,11 +250,11 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, unsigned EltSize; if (LdSt.mayLoad()) - EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; else { assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); + EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; } if (isStride64(Opc)) @@ -345,7 +350,7 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, FirstLdSt.getParent()->getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); - return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; + return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, @@ -433,7 +438,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RC->getSize() > 4) { + if (RI.getRegSizeInBits(*RC) > 32) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { @@ -493,11 +498,11 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - if (DstRC->getSize() == 4) { + if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { return AMDGPU::V_MOV_B64_PSEUDO; } return AMDGPU::COPY; @@ -557,17 +562,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, Size, Align); + unsigned SpillSize = TRI->getSpillSize(*RC); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling SGPRs. - const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); + const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); // The SGPR spill/restore instructions only work on numbered SGPRs, so we need // to make sure we are using the correct register class. - if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -602,7 +608,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data @@ -660,6 +666,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); unsigned Size = FrameInfo.getObjectSize(FrameIndex); + unsigned SpillSize = TRI->getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -670,8 +677,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. - const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); - if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { + const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); + if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } @@ -701,7 +708,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); - unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
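For context on the getSGPRSpillSaveOpcode(SpillSize) calls above: the helper itself is not in this diff; from the surrounding tree it is, roughly, a byte-size switch like the sketch below, which is why feeding it TRI->getSpillSize(*RC) keeps the existing behavior (a sketch from memory, not verbatim; the restore and VGPR variants follow the same shape):

static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
  switch (Size) {
  case 4:  return AMDGPU::SI_SPILL_S32_SAVE;
  case 8:  return AMDGPU::SI_SPILL_S64_SAVE;
  case 16: return AMDGPU::SI_SPILL_S128_SAVE;
  case 32: return AMDGPU::SI_SPILL_S256_SAVE;
  case 64: return AMDGPU::SI_SPILL_S512_SAVE;
  default: llvm_unreachable("unknown register size");
  }
}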
@@ -1440,9 +1447,9 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); - unsigned DstSize = DstRC->getSize(); + unsigned DstSize = RI.getRegSizeInBits(*DstRC); - if (DstSize == 4) { + if (DstSize == 32) { unsigned SelOp = Pred == SCC_TRUE ? AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; @@ -1456,7 +1463,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, return; } - if (DstSize == 8 && Pred == SCC_TRUE) { + if (DstSize == 64 && Pred == SCC_TRUE) { MachineInstr *Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) .addReg(FalseReg) @@ -1483,7 +1490,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; const int16_t *SubIndices = Sub0_15; - int NElts = DstSize / 4; + int NElts = DstSize / 32; // 64-bit select is only available for SALU. if (Pred == SCC_TRUE) { @@ -2635,6 +2642,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) return; + // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for + // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane + // select is uniform. + if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && + RI.isVGPR(MRI, Src1.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + return; + } + // We do not use commuteInstruction here because it is too aggressive and will // commute if it is possible. We only want to commute here if it improves // legality. This can be called a fairly large number of times so don't waste @@ -2729,7 +2749,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); unsigned DstReg = MRI.createVirtualRegister(SRC); - unsigned SubRegs = VRC->getSize() / 4; + unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; SmallVector<unsigned, 8> SRegs; for (unsigned i = 0; i < SubRegs; ++i) { @@ -3595,7 +3615,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(16) .add(Src0); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) - .addImm(0xffff); + .addImm(0xffff0000); BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg) .add(Src1) .addReg(ImmReg, RegState::Kill) diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 659473ca6a47..03a5ef74b179 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -626,13 +626,13 @@ public: return 4; } - return RI.getRegClass(OpInfo.RegClass)->getSize(); + return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; } /// \brief This form should usually be preferred since it handles operands /// with unknown register classes. 
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { - return getOpRegClass(MI, OpNo)->getSize(); + return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } /// \returns true if it is legal for the operand at index \p OpNo diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index c6daf743f3ac..7b052844f177 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -646,11 +646,10 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; -def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; -def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">; +def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">; // VOP3Mods, but the input source is known to never be NaN. def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 2f89503e129a..3f6ddec70479 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -94,6 +94,12 @@ defm V_INTERP_MOV_F32 : VINTRP_m < //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// +def ATOMIC_FENCE : SPseudoInstSI< + (outs), (ins i32imm:$ordering, i32imm:$scope), + [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))], + "ATOMIC_FENCE $ordering, $scope"> { + let hasSideEffects = 1; +} let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -111,12 +117,6 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> { - let hasSideEffects = 1; - let SALU = 1; - let usesCustomInserter = 1; -} - let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; @@ -400,13 +400,8 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < let Predicates = [isGCN] in { def : Pat< - (trap), - (S_TRAP_PSEUDO TRAPID.LLVM_TRAP) ->; - -def : Pat< - (debugtrap), - (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP) + (AMDGPUtrap timm:$trapid), + (S_TRAP $trapid) >; def : Pat< @@ -477,8 +472,8 @@ def : Pat < // fp_to_fp16 patterns def : Pat < - (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))), - (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod) + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; def : Pat < @@ -507,11 +502,11 @@ def : Pat < multiclass FMADPat <ValueType vt, Instruction inst> { def : Pat < - (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3NoMods vt:$src1, i32:$src1_modifiers), - (VOP3NoMods vt:$src2, i32:$src2_modifiers))), - (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - $src2_modifiers, $src2, $clamp, $omod) + (vt (fmad (VOP3NoMods vt:$src0), + (VOP3NoMods vt:$src1), + 
(VOP3NoMods vt:$src2))), + (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; } @@ -681,10 +676,9 @@ def : BitConvert <v16f32, v16i32, VReg_512>; // If denormals are not enabled, it only impacts the compare of the // inputs. The output result is not flushed. class ClampPat<Instruction inst, ValueType vt> : Pat < - (vt (AMDGPUclamp - (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))), + (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), (inst i32:$src0_modifiers, vt:$src0, - i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod) + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) >; def : ClampPat<V_MAX_F32_e64, f32>; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 8e612d2ddfda..b6a982aee6be 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -25,6 +25,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) TIDReg(AMDGPU::NoRegister), ScratchRSrcReg(AMDGPU::NoRegister), ScratchWaveOffsetReg(AMDGPU::NoRegister), + FrameOffsetReg(AMDGPU::NoRegister), + StackPtrOffsetReg(AMDGPU::NoRegister), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 810fb05984c4..dc9f509e60ae 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -88,6 +88,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned ScratchRSrcReg; unsigned ScratchWaveOffsetReg; + // Register holding the offset of the current function's frame, derived by + // incrementing the kernel's scratch wave offset register. For an entry + // function, this is exactly the same as the ScratchWaveOffsetReg. + unsigned FrameOffsetReg; + + // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. + unsigned StackPtrOffsetReg; + // Input registers for non-HSA ABI unsigned PrivateMemoryPtrUserSGPR; @@ -364,9 +372,25 @@ public: return ScratchWaveOffsetReg; } + unsigned getFrameOffsetReg() const { + return FrameOffsetReg; + } + + void setStackPtrOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + StackPtrOffsetReg = Reg; + } + + unsigned getStackPtrOffsetReg() const { + return StackPtrOffsetReg; + } + void setScratchWaveOffsetReg(unsigned Reg) { assert(Reg != AMDGPU::NoRegister && "Should never be unset"); ScratchWaveOffsetReg = Reg; + + // FIXME: Only for entry functions. + FrameOffsetReg = ScratchWaveOffsetReg; } unsigned getQueuePtrUserSGPR() const { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 098c67252dd8..8820e294562b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -146,6 +146,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); + // M0 has to be reserved so that llvm accepts it as a live-in into a block. + reserveRegisterTuples(Reserved, AMDGPU::M0); + // Reserve the memory aperture registers. 
reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT); @@ -615,7 +618,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true); + std::tie(EltSize, ScalarStoreOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, true); } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); @@ -775,7 +779,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. - std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false); + std::tie(EltSize, ScalarLoadOp) = + getSpillEltSize(getRegSizeInBits(*RC) / 8, false); } ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize); @@ -1038,20 +1043,21 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TODO: It might be helpful to have some target specific flags in // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - switch (RC->getSize()) { - case 0: return false; - case 1: return false; - case 4: + unsigned Size = getRegSizeInBits(*RC); + if (Size < 32) + return false; + switch (Size) { + case 32: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; - case 8: + case 64: return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; - case 12: + case 96: return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; - case 16: + case 128: return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; - case 32: + case 256: return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; - case 64: + case 512: return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; default: llvm_unreachable("Invalid register class size"); @@ -1060,18 +1066,18 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - switch (SRC->getSize()) { - case 4: + switch (getRegSizeInBits(*SRC)) { + case 32: return &AMDGPU::VGPR_32RegClass; - case 8: + case 64: return &AMDGPU::VReg_64RegClass; - case 12: + case 96: return &AMDGPU::VReg_96RegClass; - case 16: + case 128: return &AMDGPU::VReg_128RegClass; - case 32: + case 256: return &AMDGPU::VReg_256RegClass; - case 64: + case 512: return &AMDGPU::VReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1080,16 +1086,16 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( const TargetRegisterClass *VRC) const { - switch (VRC->getSize()) { - case 4: + switch (getRegSizeInBits(*VRC)) { + case 32: return &AMDGPU::SGPR_32RegClass; - case 8: + case 64: return &AMDGPU::SReg_64RegClass; - case 16: + case 128: return &AMDGPU::SReg_128RegClass; - case 32: + case 256: return &AMDGPU::SReg_256RegClass; - case 64: + case 512: return &AMDGPU::SReg_512RegClass; default: llvm_unreachable("Invalid register class size"); @@ -1354,15 +1360,15 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const { - 
unsigned SrcSize = SrcRC->getSize(); - unsigned DstSize = DstRC->getSize(); - unsigned NewSize = NewRC->getSize(); + unsigned SrcSize = getRegSizeInBits(*SrcRC); + unsigned DstSize = getRegSizeInBits(*DstRC); + unsigned NewSize = getRegSizeInBits(*NewRC); // Do not increase size of registers beyond dword, we would need to allocate // adjacent registers and constrain regalloc more than needed. // Always allow dword coalescing. - if (SrcSize <= 4 || DstSize <= 4) + if (SrcSize <= 32 || DstSize <= 32) return true; return NewSize <= DstSize || NewSize <= SrcSize; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index b4adbdd1df07..593439c2a3cd 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -530,14 +530,16 @@ class SOPKInstTable <bit is_sopk, string cmpOp = ""> { class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins u16imm:$simm16), + (ins s16imm:$simm16), "$sdst, $simm16", pattern>; -class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < +class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo < opName, (outs), - (ins SReg_32:$sdst, u16imm:$simm16), + !if(isSignExt, + (ins SReg_32:$sdst, s16imm:$simm16), + (ins SReg_32:$sdst, u16imm:$simm16)), "$sdst, $simm16", []>, SOPKInstTable<1, base_op>{ let Defs = [SCC]; @@ -546,7 +548,7 @@ class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo < class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs SReg_32:$sdst), - (ins SReg_32:$src0, u16imm:$simm16), + (ins SReg_32:$src0, s16imm:$simm16), "$sdst, $simm16", pattern >; @@ -575,20 +577,20 @@ let isCompare = 1 in { // [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] // >; -def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">; -def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">; -def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">; -def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">; -def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">; -def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">; +def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32", 1>; +def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32", 1>; +def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32", 1>; +def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>; +def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>; +def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>; let SOPKZext = 1 in { -def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">; -def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">; -def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">; -def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">; -def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">; -def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">; +def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>; +def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>; +def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>; +def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>; +def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>; +def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>; } // End SOPKZext = 1 } // End isCompare = 1 @@ -600,7 +602,7 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", def 
S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, u16imm:$simm16), + (outs), (ins SReg_64:$sdst, s16imm:$simm16), "$sdst, $simm16" >; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 86095a8e1142..5a3242bed1d0 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -93,6 +93,12 @@ unsigned getVmcntBitWidthHi() { return 2; } } // end namespace anonymous namespace llvm { + +static cl::opt<bool> EnablePackedInlinableLiterals( + "enable-packed-inlinable-literals", + cl::desc("Enable packed inlinable literals (v2f16, v2i16)"), + cl::init(false)); + namespace AMDGPU { namespace IsaInfo { @@ -703,6 +709,9 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); + if (!EnablePackedInlinableLiterals) + return false; + int16_t Lo16 = static_cast<int16_t>(Literal); int16_t Hi16 = static_cast<int16_t>(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 4f5711ca9a79..5c9d589e2625 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -905,7 +905,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::STRi12)) @@ -1103,7 +1103,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 4: if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg) diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index faf1c631a3a7..28c407f74125 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -105,10 +105,6 @@ public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; - virtual void getNoopForElfTarget(MCInst &NopInst) const { - getNoopForMachoTarget(NopInst); - } - // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 70a44eaaceb8..a20887564f44 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -806,7 +806,8 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI, if (!DstSubReg) return true; // Small registers don't frequently cause a problem, so we can coalesce them. 
- if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32) + if (getRegSizeInBits(*NewRC) < 256 && getRegSizeInBits(*DstRC) < 256 && + getRegSizeInBits(*SrcRC) < 256) return true; auto NewRCWeight = diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 94b317a8f986..13fb30767c9f 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -35,7 +35,8 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { EVT VT = TLI.getValueType(DL, T, true); - if (!VT.isSimple() || VT.isVector()) + if (!VT.isSimple() || VT.isVector() || + !(VT.isInteger() || VT.isFloatingPoint())) return false; unsigned VTSize = VT.getSimpleVT().getSizeInBits(); diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e0aecff2633b..78a9144bd321 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -661,7 +661,6 @@ static bool IsAnAddressOperand(const MachineOperand &MO) { return false; case MachineOperand::MO_IntrinsicID: case MachineOperand::MO_Predicate: - case MachineOperand::MO_Placeholder: llvm_unreachable("should not exist post-isel"); } llvm_unreachable("unhandled machine operand type"); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 70dbe1bc5b95..4f7a0ab4e220 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1960,10 +1960,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. assert(RS && "Register scavenging not provided"); - const TargetRegisterClass *RC = &ARM::GPRRegClass; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &RC = ARM::GPRRegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 165e9b7378c7..382f881f7741 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -3358,8 +3358,12 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - // FIXME: handle "fence singlethread" more efficiently. SDLoc dl(Op); + ConstantSDNode *ScopeN = cast<ConstantSDNode>(Op.getOperand(2)); + auto Scope = static_cast<SynchronizationScope>(ScopeN->getZExtValue()); + if (Scope == SynchronizationScope::SingleThread) + return Op; + if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get @@ -9476,8 +9480,11 @@ AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } - // Don't generate vpaddl+vmovn; we'll match it to vpadd later. - if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure + // we're using the entire input vector, otherwise there's a size/legality + // mismatch somewhere. 
+ if (nextIndex != Vec.getValueType().getVectorNumElements() || + Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) return SDValue(); // Create VPADDL node. diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 3b3606ef462a..a0e2ac4cbc6f 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -32,8 +32,8 @@ using namespace llvm; ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void ARMInstrInfo::getNoop(MCInst &NopInst) const { if (hasNOP()) { NopInst.setOpcode(ARM::HINT); NopInst.addOperand(MCOperand::createImm(0)); diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 4b1b7097b18d..c87fb97448c9 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -25,8 +25,8 @@ class ARMInstrInfo : public ARMBaseInstrInfo { public: explicit ARMInstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is not such an opcode. diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 703e8071b177..9d8ee5c3f9dc 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -5975,3 +5975,10 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status), (ins GPR:$addr, GPRPair:$desired, GPRPair:$new), NoItinerary, []>, Sched<[]>; } + +def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary, + [(atomic_fence imm:$ordering, 0)]> { + let hasSideEffects = 1; + let Size = 0; + let AsmString = "@ COMPILER BARRIER"; +} diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index f2f426e86701..8048c758e998 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -953,7 +953,7 @@ let isAdd = 1 in { /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. let hasPostISelHook = 1, Defs = [CPSR] in { - let isCommutable = 1 in + let isCommutable = 1, Uses = [CPSR] in def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), 2, IIC_iALUr, [(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm, @@ -1292,6 +1292,7 @@ def tSUBrr : // A8.6.212 /// These opcodes will be converted to the real non-S opcodes by /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand. 
let hasPostISelHook = 1, Defs = [CPSR] in { + let Uses = [CPSR] in def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), 2, IIC_iALUr, [(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm, diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 816596b85721..1c13d51a468e 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -47,12 +47,9 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, unsigned SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); (void)SrcSize; - assert((DstSize == SrcSize || - // Copies are a means to setup initial types, the number of - // bits may not exactly match. - (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - DstSize <= SrcSize)) && - "Copy with different width?!"); + // We use copies for trunc, so it's ok for the size of the destination to be + // smaller (the higher bits will just be undefined). + assert(DstSize <= SrcSize && "Copy with different width?!"); assert((RegBank->getID() == ARM::GPRRegBankID || RegBank->getID() == ARM::FPRRegBankID) && @@ -294,6 +291,28 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } break; } + case G_TRUNC: { + // The high bits are undefined, so there's nothing special to do, just + // treat it as a copy. + auto SrcReg = I.getOperand(1).getReg(); + auto DstReg = I.getOperand(0).getReg(); + + const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + if (SrcRegBank.getID() != DstRegBank.getID()) { + DEBUG(dbgs() << "G_TRUNC operands on different register banks\n"); + return false; + } + + if (SrcRegBank.getID() != ARM::GPRRegBankID) { + DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n"); + return false; + } + + I.setDesc(TII.get(COPY)); + return selectCopy(I, TII, MRI, TRI, RBI); + } case G_ADD: case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); @@ -313,6 +332,16 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; + case G_SDIV: + assert(TII.getSubtarget().hasDivideInARMMode() && "Unsupported operation"); + I.setDesc(TII.get(ARM::SDIV)); + MIB.add(predOps(ARMCC::AL)); + break; + case G_UDIV: + assert(TII.getSubtarget().hasDivideInARMMode() && "Unsupported operation"); + I.setDesc(TII.get(ARM::UDIV)); + MIB.add(predOps(ARMCC::AL)); + break; case G_FADD: if (!selectFAdd(MIB, TII, MRI)) return false; @@ -332,6 +361,18 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { "Expected constant to live in a GPR"); I.setDesc(TII.get(ARM::MOVi)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); + + auto &Val = I.getOperand(1); + if (Val.isCImm()) { + if (Val.getCImm()->getBitWidth() > 32) + return false; + Val.ChangeToImmediate(Val.getCImm()->getZExtValue()); + } + + if (!Val.isImm()) { + return false; + } + break; } case G_STORE: diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index fe9681439e6b..9b86030fdd29 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -13,6 +13,8 @@ #include "ARMLegalizerInfo.h" #include "ARMSubtarget.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" @@ -47,6 +49,18 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (auto Ty : {s1, s8, 
s16, s32}) setAction({Op, Ty}, Legal); + for (unsigned Op : {G_SDIV, G_UDIV}) { + for (auto Ty : {s8, s16}) + // FIXME: We need WidenScalar here, but in the case of targets with + // software division we'll also need Libcall afterwards. Treat as Custom + // until we have better support for chaining legalization actions. + setAction({Op, Ty}, Custom); + if (ST.hasDivideInARMMode()) + setAction({Op, s32}, Legal); + else + setAction({Op, s32}, Libcall); + } + for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); for (auto Ty : {s1, s8, s16}) @@ -75,3 +89,48 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { computeTables(); } + +bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + using namespace TargetOpcode; + + switch (MI.getOpcode()) { + default: + return false; + case G_SDIV: + case G_UDIV: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(8)) + return false; + + // We need to widen to 32 bits and then maybe, if the target requires, + // transform into a libcall. + LegalizerHelper Helper(MIRBuilder.getMF()); + + MachineInstr *NewMI = nullptr; + Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) { + // Store the new, 32-bit div instruction. + if (MI->getOpcode() == G_SDIV || MI->getOpcode() == G_UDIV) + NewMI = MI; + }); + + auto Result = Helper.widenScalar(MI, 0, LLT::scalar(32)); + Helper.MIRBuilder.stopRecordingInsertions(); + if (Result == LegalizerHelper::UnableToLegalize) { + return false; + } + assert(NewMI && "Couldn't find widened instruction"); + assert((NewMI->getOpcode() == G_SDIV || NewMI->getOpcode() == G_UDIV) && + "Unexpected widened instruction"); + assert(MRI.getType(NewMI->getOperand(0).getReg()).getSizeInBits() == 32 && + "Unexpected type for the widened instruction"); + + Result = Helper.legalizeInstrStep(*NewMI); + if (Result == LegalizerHelper::UnableToLegalize) { + return false; + } + return true; + } + } +} diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h index 0b8a608a6bde..a9bdd367737e 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.h +++ b/lib/Target/ARM/ARMLegalizerInfo.h @@ -24,6 +24,9 @@ class ARMSubtarget; class ARMLegalizerInfo : public LegalizerInfo { public: ARMLegalizerInfo(const ARMSubtarget &ST); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; }; } // End llvm namespace. 
#endif diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 0fd98268723a..9e9c1ba6c114 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -211,11 +211,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) .addImm(ARMCC::AL).addReg(0)); MCInst Noop; - Subtarget->getInstrInfo()->getNoopForElfTarget(Noop); + Subtarget->getInstrInfo()->getNoop(Noop); for (int8_t I = 0; I < NoopsInSledCount; I++) - { OutStreamer->EmitInstruction(Noop, getSubtargetInfo()); - } OutStreamer->EmitLabel(Target); recordSled(CurSled, MI, Kind); diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index e47bd3a8963e..7325817d446b 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -221,8 +221,11 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_ADD: case G_SUB: case G_MUL: + case G_SDIV: + case G_UDIV: case G_SEXT: case G_ZEXT: + case G_TRUNC: case G_GEP: // FIXME: We're abusing the fact that everything lives in a GPR for now; in // the real world we would use different mappings. diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 27bff4d75acf..0ebf55924647 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,8 +24,8 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. +void Thumb1InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(ARM::tMOVr); NopInst.addOperand(MCOperand::createReg(ARM::R8)); NopInst.addOperand(MCOperand::createReg(ARM::R8)); diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 931914ad2799..e8d9a9c4ff14 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -25,8 +25,8 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb1InstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is no such opcode. diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 818ba85c7d40..2e2dfe035e26 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -32,8 +32,8 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI() {} -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop.
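+/// (Thumb2 has an architected NOP, hint #0 via tHINT; Thumb1 above has to
+/// emulate one with a "mov r8, r8".)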
+void Thumb2InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(ARM::tHINT); NopInst.addOperand(MCOperand::createImm(0)); NopInst.addOperand(MCOperand::createImm(ARMCC::AL)); diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 15d63300b6a2..c834ba73bfea 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -26,8 +26,8 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { public: explicit Thumb2InstrInfo(const ARMSubtarget &STI); - /// getNoopForMachoTarget - Return the noop instruction to use for a noop. - void getNoopForMachoTarget(MCInst &NopInst) const override; + /// Return the noop instruction to use for a noop. + void getNoop(MCInst &NopInst) const override; // Return the non-pre/post incrementing version of 'Opc'. Return 0 // if there is no such opcode. diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index 50bb50b44f27..d6491ce5c3bf 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -112,7 +112,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); - unsigned BytesPerReg = TRI.getMinimalPhysRegClass(Reg)->getSize(); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + unsigned BytesPerReg = TRI.getRegSizeInBits(*RC) / 8; assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported."); unsigned RegIdx = ByteNumber / BytesPerReg; diff --git a/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/lib/Target/AVR/AVRExpandPseudoInsts.cpp index 13080a5d72f0..540e05a92997 100644 --- a/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -88,6 +88,9 @@ private: unsigned ArithOpcode, Block &MBB, BlockIt MBBI); + + /// Scavenges a free GPR8 register for use. + unsigned scavengeGPR8(MachineInstr &MI); }; char AVRExpandPseudo::ID = 0; @@ -577,24 +580,43 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned OpLo, OpHi, DstLoReg, DstHiReg; unsigned DstReg = MI.getOperand(0).getReg(); + unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); - bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::LDRdPtr; OpHi = AVR::LDDRdPtrQ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); - assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same"); + // Use a temporary register if src and dst registers are the same. + if (DstReg == SrcReg) + TmpReg = scavengeGPR8(MI); + + unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; + unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + + // Load low byte. auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(CurDstLoReg, RegState::Define) .addReg(SrcReg); + // Push low byte onto stack if necessary. + if (TmpReg) + buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + + // Load high byte. auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(CurDstHiReg, RegState::Define) .addReg(SrcReg, getKillRegState(SrcIsKill)) .addImm(1); + if (TmpReg) { + // Move the high byte into the final destination. + buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg); + + // Move the low byte from the scratch space into the final destination.
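+ // (This pops the byte pushed before the high-byte load above; both loads
+ // reuse the same scavenged scratch register.)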
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg); + } + MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -669,9 +691,9 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) { MachineInstr &MI = *MBBI; unsigned OpLo, OpHi, DstLoReg, DstHiReg; unsigned DstReg = MI.getOperand(0).getReg(); + unsigned TmpReg = 0; // 0 for no temporary register unsigned SrcReg = MI.getOperand(1).getReg(); unsigned Imm = MI.getOperand(2).getImm(); - bool DstIsDead = MI.getOperand(0).isDead(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::LDDRdPtrQ; OpHi = AVR::LDDRdPtrQ; @@ -679,60 +701,35 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) { assert(Imm <= 63 && "Offset is out of range"); - MachineInstr *MIBLO, *MIBHI; - - // HACK: We shouldn't have instances of this instruction - // where src==dest because the instruction itself is - // marked earlyclobber. We do however get this instruction when - // loading from stack slots where the earlyclobber isn't useful. - // - // In this case, just use a temporary register. - if (DstReg == SrcReg) { - RegScavenger RS; - - RS.enterBasicBlock(MBB); - RS.forward(MBBI); - - BitVector Candidates = - TRI->getAllocatableSet - (*MBB.getParent(), &AVR::GPR8RegClass); - - // Exclude all the registers being used by the instruction. - for (MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - Candidates.reset(MO.getReg()); - } - - BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass); - Available &= Candidates; + // Use a temporary register if src and dst registers are the same. + if (DstReg == SrcReg) + TmpReg = scavengeGPR8(MI); - signed TmpReg = Available.find_first(); - assert(TmpReg != -1 && "ran out of registers"); + unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; + unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; - MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(TmpReg, RegState::Define) - .addReg(SrcReg) - .addImm(Imm); + // Load low byte. + auto MIBLO = buildMI(MBB, MBBI, OpLo) + .addReg(CurDstLoReg, RegState::Define) + .addReg(SrcReg) + .addImm(Imm); - buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstLoReg).addReg(TmpReg); + // Push low byte onto stack if necessary. + if (TmpReg) + buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); - MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(TmpReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); + // Load high byte. + auto MIBHI = buildMI(MBB, MBBI, OpHi) + .addReg(CurDstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(Imm + 1); + if (TmpReg) { + // Move the high byte into the final destination. buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg); - } else { - MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg) - .addImm(Imm); - MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); + // Move the low byte from the scratch space into the final destination. 
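+ // (Same push/pop scheme as in LDWRdPtr above: the scratch register now
+ // holds the high byte, so the low byte comes back off the stack.)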
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg); } MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -819,6 +816,32 @@ bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width, }); } +unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + RegScavenger RS; + + RS.enterBasicBlock(MBB); + RS.forward(MI); + + BitVector Candidates = + TRI->getAllocatableSet + (*MBB.getParent(), &AVR::GPR8RegClass); + + // Exclude all the registers being used by the instruction. + for (MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() && + !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + Candidates.reset(MO.getReg()); + } + + BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass); + Available &= Candidates; + + signed Reg = Available.find_first(); + assert(Reg != -1 && "ran out of registers"); + return Reg; +} + template<> bool AVRExpandPseudo::expand<AVR::AtomicLoad8>(Block &MBB, BlockIt MBBI) { return expandAtomicBinaryOp(AVR::LDRdPtr, MBB, MBBI); @@ -948,7 +971,6 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) { unsigned OpLo, OpHi, SrcLoReg, SrcHiReg; unsigned DstReg = MI.getOperand(0).getReg(); unsigned SrcReg = MI.getOperand(1).getReg(); - bool DstIsKill = MI.getOperand(0).isKill(); bool SrcIsKill = MI.getOperand(1).isKill(); OpLo = AVR::STPtrRr; OpHi = AVR::STDPtrQRr; @@ -960,7 +982,7 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) { .addReg(SrcLoReg, getKillRegState(SrcIsKill)); auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(DstReg, getKillRegState(DstIsKill)) + .addReg(DstReg) .addImm(1) .addReg(SrcHiReg, getKillRegState(SrcIsKill)); diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index b8cb2215ddb4..ab42a7aa9901 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -239,7 +239,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( unsigned Reg = CSI[i - 1].getReg(); bool IsNotLiveIn = !MBB.isLiveIn(Reg); - assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); // Add the callee-saved register as live-in only if it is not already a @@ -277,7 +277,7 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters( for (const CalleeSavedInfo &CCSI : CSI) { unsigned Reg = CCSI.getReg(); - assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 && + assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && "Invalid register size"); BuildMI(MBB, MI, DL, TII.get(AVR::POPRd), Reg); diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp index 88f889260cce..afba66b2e69b 100644 --- a/lib/Target/AVR/AVRInstrInfo.cpp +++ b/lib/Target/AVR/AVRInstrInfo.cpp @@ -142,9 +142,9 @@ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlignment(FrameIndex)); unsigned Opcode = 0; - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::STDPtrQRr; - } else if (RC->hasType(MVT::i16)) { + } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { Opcode = AVR::STDWPtrQRr; } else { llvm_unreachable("Cannot store this register into a stack slot!"); @@ -176,9 +176,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlignment(FrameIndex)); unsigned Opcode = 0; - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::LDDRdPtrQ; - } 
else if (RC->hasType(MVT::i16)) { + } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { // Opcode = AVR::LDDWRdPtrQ; //:FIXME: remove this once PR13375 gets fixed Opcode = AVR::LDDWRdYQ; diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 48798bd4a1da..5cc7eaf8add3 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -78,11 +78,12 @@ BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { const TargetRegisterClass * AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { - if (RC->hasType(MVT::i16)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { return &AVR::DREGSRegClass; } - if (RC->hasType(MVT::i8)) { + if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { return &AVR::GPR8RegClass; } diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp index cb3049bf1500..07767d1037a9 100644 --- a/lib/Target/Hexagon/BitTracker.cpp +++ b/lib/Target/Hexagon/BitTracker.cpp @@ -347,7 +347,7 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const { unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub); const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS); - uint16_t BW = RC->getSize()*8; + uint16_t BW = TRI.getRegSizeInBits(*RC); return BW; } diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index fda23f8f6b05..c8483f7e6e76 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -286,9 +286,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo(); const MachineFunction &MF = *MI.getParent()->getParent(); const auto &HST = MF.getSubtarget<HexagonSubtarget>(); - unsigned VectorSize = HST.useHVXSglOps() - ? Hexagon::VectorRegsRegClass.getSize() - : Hexagon::VectorRegs128BRegClass.getSize(); + const auto &VecRC = HST.useHVXSglOps() ? 
Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned VectorSize = HST.getRegisterInfo()->getSpillSize(VecRC); switch (Inst.getOpcode()) { default: return; diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 61f290ca98d7..8502bf24c02f 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -407,7 +407,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg); if (RR.Sub == 0) { Begin = 0; - Width = RC->getSize()*8; + Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC); return true; } @@ -417,7 +417,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, case Hexagon::DoubleRegsRegClassID: case Hexagon::VecDblRegsRegClassID: case Hexagon::VecDblRegs128BRegClassID: - Width = RC->getSize()*8 / 2; + Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 2; if (RR.Sub == Hexagon::isub_hi || RR.Sub == Hexagon::vsub_hi) Begin = Width; break; @@ -1054,8 +1054,8 @@ namespace { class RedundantInstrElimination : public Transformation { public: RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii, - MachineRegisterInfo &mri) - : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + const HexagonRegisterInfo &hri, MachineRegisterInfo &mri) + : Transformation(true), HII(hii), HRI(hri), MRI(mri), BT(bt) {} bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; @@ -1070,6 +1070,7 @@ namespace { bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS); const HexagonInstrInfo &HII; + const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1262,7 +1263,7 @@ bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI, assert(MI.getOperand(OpN).isReg()); BitTracker::RegisterRef RR = MI.getOperand(OpN); const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI); - uint16_t Width = RC->getSize()*8; + uint16_t Width = HRI.getRegSizeInBits(*RC); if (!GotBits) T.set(Begin, Begin+Width); @@ -2173,8 +2174,10 @@ bool BitSimplification::genBitSplit(MachineInstr *MI, const RegisterSet &AVs) { if (!GenBitSplit) return false; - if (CountBitSplit >= MaxBitSplit) - return false; + if (MaxBitSplit.getNumOccurrences()) { + if (CountBitSplit >= MaxBitSplit) + return false; + } unsigned Opc = MI->getOpcode(); switch (Opc) { @@ -2253,7 +2256,8 @@ bool BitSimplification::genBitSplit(MachineInstr *MI, continue; // Generate bitsplit where S is defined. - CountBitSplit++; + if (MaxBitSplit.getNumOccurrences()) + CountBitSplit++; MachineInstr *DefS = MRI.getVRegDef(S); assert(DefS != nullptr); DebugLoc DL = DefS->getDebugLoc(); @@ -2379,9 +2383,11 @@ bool BitSimplification::simplifyExtractLow(MachineInstr *MI, const RegisterSet &AVs) { if (!GenExtract) return false; - if (CountExtract >= MaxExtract) - return false; - CountExtract++; + if (MaxExtract.getNumOccurrences()) { + if (CountExtract >= MaxExtract) + return false; + CountExtract++; + } unsigned W = RC.width(); unsigned RW = W; @@ -2651,7 +2657,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { Changed |= visitBlock(Entry, ImmG, AIG); RegisterSet ARE; // Available registers for RIE. 
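+ // (RIE now also receives HRI so computeUsedBits can query register widths
+ // through getRegSizeInBits instead of TargetRegisterClass::getSize.)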
- RedundantInstrElimination RIE(BT, HII, MRI); + RedundantInstrElimination RIE(BT, HII, HRI, MRI); bool Ried = visitBlock(Entry, RIE, ARE); if (Ried) { Changed = true; diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index d8ba5dcd35ad..9f8c9ded8127 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -559,10 +559,10 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO, } unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS); - switch (RC->getSize()) { - case 4: + switch (TRI->getRegSizeInBits(*RC)) { + case 32: return IfTrue ? A2_tfrt : A2_tfrf; - case 8: + case 64: return IfTrue ? A2_tfrpt : A2_tfrpf; } llvm_unreachable("Invalid register operand"); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 0e2380f4316a..a04aca4afa0f 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1425,7 +1425,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, if (!SRegs[S->Reg]) continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg); - int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), S->Offset); + int FI = MFI.CreateFixedSpillStackObject(TRI->getSpillSize(*RC), S->Offset); MinOffset = std::min(MinOffset, S->Offset); CSI.push_back(CalleeSavedInfo(S->Reg, FI)); SRegs[S->Reg] = false; @@ -1437,11 +1437,12 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF, for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) { unsigned R = x; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R); - int Off = MinOffset - RC->getSize(); - unsigned Align = std::min(RC->getAlignment(), getStackAlignment()); + unsigned Size = TRI->getSpillSize(*RC); + int Off = MinOffset - Size; + unsigned Align = std::min(TRI->getSpillAlignment(*RC), getStackAlignment()); assert(isPowerOf2_32(Align)); Off &= -Align; - int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Off); + int FI = MFI.CreateFixedSpillStackObject(Size, Off); MinOffset = std::min(MinOffset, Off); CSI.push_back(CalleeSavedInfo(R, FI)); SRegs[R] = false; @@ -1677,10 +1678,10 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, int FI = MI->getOperand(0).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - unsigned Size = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned Size = HRI.getSpillSize(RC); + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned StoreOpc; @@ -1734,10 +1735,10 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B, int FI = MI->getOperand(1).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - unsigned Size = RC->getSize(); - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? 
Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned Size = HRI.getSpillSize(RC); + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned LoadOpc; @@ -1777,16 +1778,16 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B, if (!MI->getOperand(0).isFI()) return false; + auto &HRI = *HST.getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned SrcR = MI->getOperand(2).getReg(); bool IsKill = MI->getOperand(2).isKill(); int FI = MI->getOperand(0).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned StoreOpc; @@ -1815,15 +1816,15 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B, if (!MI->getOperand(1).isFI()) return false; + auto &HRI = *HST.getRegisterInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned DstR = MI->getOperand(0).getReg(); int FI = MI->getOperand(1).getIndex(); bool Is128B = HST.useHVXDblOps(); - auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass - : &Hexagon::VectorRegs128BRegClass; - - unsigned NeedAlign = RC->getAlignment(); + const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass + : Hexagon::VectorRegs128BRegClass; + unsigned NeedAlign = HRI.getSpillAlignment(RC); unsigned HasAlign = MFI.getObjectAlignment(FI); unsigned LoadOpc; @@ -1932,7 +1933,7 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!needToReserveScavengingSpillSlots(MF, HRI, RC)) continue; unsigned Num = RC == &Hexagon::IntRegsRegClass ? 
NumberScavengerSlots : 1; - unsigned S = RC->getSize(), A = RC->getAlignment(); + unsigned S = HRI.getSpillSize(*RC), A = HRI.getSpillAlignment(*RC); for (unsigned i = 0; i < Num; i++) { int NewFI = MFI.CreateSpillStackObject(S, A); RS->addScavengingFrameIndex(NewFI); diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index b5948475e1f7..1829c5da02a6 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -26,6 +26,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -1206,10 +1207,9 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, if (!T) return false; - unsigned BW = T->getBitWidth(); - APInt K0(BW, 0), K1(BW, 0); - computeKnownBits(V, K0, K1, DL); - return K0.countLeadingOnes() >= IterCount; + KnownBits Known(T->getBitWidth()); + computeKnownBits(V, Known, DL); + return Known.Zero.countLeadingOnes() >= IterCount; } diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index c0c29b992238..22fc2474fae6 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -122,6 +122,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), } let usesCustomInserter = 1 in { + let Uses = [SR] in { def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc), "# Select8 PSEUDO", [(set GR8:$dst, @@ -130,6 +131,7 @@ let usesCustomInserter = 1 in { "# Select16 PSEUDO", [(set GR16:$dst, (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>; + } let Defs = [SR] in { def Shl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt), "# Shl8 PSEUDO", diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 2a9d96205eb9..134f7ac3aea3 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -273,9 +273,9 @@ void MipsAsmPrinter::printSavedRegsBitmask() { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); // size of stack area to which FP callee-saved regs are saved. - unsigned CPURegSize = Mips::GPR32RegClass.getSize(); - unsigned FGR32RegSize = Mips::FGR32RegClass.getSize(); - unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize(); + unsigned CPURegSize = TRI->getRegSizeInBits(Mips::GPR32RegClass) / 8; + unsigned FGR32RegSize = TRI->getRegSizeInBits(Mips::FGR32RegClass) / 8; + unsigned AFGR64RegSize = TRI->getRegSizeInBits(Mips::AFGR64RegClass) / 8; bool HasAFGR64Reg = false; unsigned CSFPRegsSize = 0; diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp index 7af988c1f64d..cb9f676c237a 100644 --- a/lib/Target/Mips/MipsCCState.cpp +++ b/lib/Target/Mips/MipsCCState.cpp @@ -38,7 +38,7 @@ static bool isF128SoftLibCall(const char *CallSym) { /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. 
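+/// (For example, an i128 argument to a long double emulation routine such as
+/// "__addtf3" is recognized here as originally being an f128 value.)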
-static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { +static bool originalTypeIsF128(const Type *Ty, const char *Func) { if (Ty->isFP128Ty()) return true; @@ -46,12 +46,9 @@ static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { Ty->getStructElementType(0)->isFP128Ty()) return true; - const ExternalSymbolSDNode *ES = - dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode); - // If the Ty is i128 and the function being called is a long double emulation // routine, then the original type is f128. - return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol())); + return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func)); } MipsCCState::SpecialCallingConvType @@ -73,11 +70,11 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, void MipsCCState::PreAnalyzeCallResultForF128( const SmallVectorImpl<ISD::InputArg> &Ins, - const TargetLowering::CallLoweringInfo &CLI) { + const Type *RetTy, const char *Call) { for (unsigned i = 0; i < Ins.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode())); - OriginalArgWasFloat.push_back(CLI.RetTy->isFloatingPointTy()); + originalTypeIsF128(RetTy, Call)); + OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); } } @@ -99,10 +96,10 @@ void MipsCCState::PreAnalyzeReturnForF128( void MipsCCState::PreAnalyzeCallOperands( const SmallVectorImpl<ISD::OutputArg> &Outs, std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const SDNode *CallNode) { + const char *Func) { for (unsigned i = 0; i < Outs.size(); ++i) { OriginalArgWasF128.push_back( - originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode)); + originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, Func)); OriginalArgWasFloat.push_back( FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy()); CallOperandIsFixed.push_back(Outs[i].IsFixed); diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h index 081c393a09be..77ecc65b2eee 100644 --- a/lib/Target/Mips/MipsCCState.h +++ b/lib/Target/Mips/MipsCCState.h @@ -31,7 +31,7 @@ private: /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins, - const TargetLowering::CallLoweringInfo &CLI); + const Type *RetTy, const char * Func); /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. @@ -42,7 +42,7 @@ private: void PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const SDNode *CallNode); + const char *Func); /// Identify lowered values that originated from f128 arguments and record /// this. 
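+/// (Taking a plain symbol name rather than an SDNode lets MipsFastISel,
+/// which has no DAG call node, share this bookkeeping; see the
+/// MipsFastISel.cpp hunk below.)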
@@ -73,8 +73,8 @@ public: AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, CCAssignFn Fn, std::vector<TargetLowering::ArgListEntry> &FuncArgs, - const SDNode *CallNode) { - PreAnalyzeCallOperands(Outs, FuncArgs, CallNode); + const char *Func) { + PreAnalyzeCallOperands(Outs, FuncArgs, Func); CCState::AnalyzeCallOperands(Outs, Fn); OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); @@ -99,9 +99,9 @@ public: } void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, - CCAssignFn Fn, - const TargetLowering::CallLoweringInfo &CLI) { - PreAnalyzeCallResultForF128(Ins, CLI); + CCAssignFn Fn, const Type *RetTy, + const char *Func) { + PreAnalyzeCallResultForF128(Ins, RetTy, Func); CCState::AnalyzeCallResult(Ins, Fn); OriginalArgWasFloat.clear(); OriginalArgWasF128.clear(); diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index c060cf06099d..a5c7bf7699ea 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1260,8 +1260,10 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0); if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips); + MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); + + CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy, + CLI.Symbol->getName().data()); // Only handle a single return value. if (RVLocs.size() != 1) diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index b2cf03976f81..ef05166503b2 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -119,7 +119,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const { // Conservatively assume all callee-saved registers will be saved. for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) { - unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize(); + unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R)); Offset = alignTo(Offset + Size, Size); } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 93c5f496ce97..8f39ebd42a5c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -2750,7 +2750,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // caller side but removing it breaks the frame size calculation. CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode()); + const ExternalSymbolSDNode *ES = + dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode()); + CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), + ES ? ES->getSymbol() : nullptr); // Get a count of how many bytes are to be pushed on the stack. unsigned NextStackOffset = CCInfo.getNextStackOffset(); @@ -2985,7 +2988,11 @@ SDValue MipsTargetLowering::LowerCallResult( SmallVector<CCValAssign, 16> RVLocs; MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI); + + const ExternalSymbolSDNode *ES = + dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode()); + CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, + ES ? ES->getSymbol() : nullptr); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0; i != RVLocs.size(); ++i) { diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index 5bf4c958c7b9..63034ecab93b 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -40,11 +40,7 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { const TargetRegisterClass *RC = STI.inMips16Mode() ? &Mips::CPU16RegsRegClass - : STI.inMicroMipsMode() - ? STI.hasMips64() - ? &Mips::GPRMM16_64RegClass - : &Mips::GPRMM16RegClass - : static_cast<const MipsTargetMachine &>(MF.getTarget()) + : static_cast<const MipsTargetMachine &>(MF.getTarget()) .getABI() .IsN64() ? &Mips::GPR64RegClass @@ -53,14 +49,15 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() { } void MipsFunctionInfo::createEhDataRegsFI() { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 4; ++I) { - const TargetRegisterClass *RC = + const TargetRegisterClass &RC = static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64() - ? &Mips::GPR64RegClass - : &Mips::GPR32RegClass; + ? Mips::GPR64RegClass + : Mips::GPR32RegClass; - EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), false); } } @@ -69,11 +66,12 @@ void MipsFunctionInfo::createISRRegFI() { // The current implementation only supports Mips32r2+ not Mips64rX. Status // is always 32 bits, ErrorPC is 32 or 64 bits dependent on architecture, // however Mips32r2+ is the supported architecture. - const TargetRegisterClass *RC = &Mips::GPR32RegClass; + const TargetRegisterClass &RC = Mips::GPR32RegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); for (int I = 0; I < 2; ++I) ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject( - RC->getSize(), RC->getAlignment(), false); + TRI.getSpillSize(RC), TRI.getSpillAlignment(RC), false); } bool MipsFunctionInfo::isEhDataRegFI(int FI) const { @@ -93,9 +91,10 @@ MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) { } int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); if (MoveF64ViaSpillFI == -1) { MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject( - RC->getSize(), RC->getAlignment(), false); + TRI.getSpillSize(*RC), TRI.getSpillAlignment(*RC), false); } return MoveF64ViaSpillFI; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index f33857fe628f..68dcbdfb4211 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -116,9 +116,10 @@ static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) { /// Return type of register Reg. 
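+/// (Assumes the register class carries exactly one legal type; the assert
+/// below verifies that via the new legalclasstypes iterators.)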
static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg); - assert(RC->vt_end() - RC->vt_begin() == 1); - return *RC->vt_begin(); + assert(TRI.legalclasstypes_end(*RC) - TRI.legalclasstypes_begin(*RC) == 1); + return *TRI.legalclasstypes_begin(*RC); } /// Do the following transformation: diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index ef8d18c6deb1..e765b4625206 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -260,7 +260,8 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, // copy dst_hi, $vr1 unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg(); - unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2; + const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst); + unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16; const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize); unsigned VR0 = MRI.createVirtualRegister(RC); unsigned VR1 = MRI.createVirtualRegister(RC); @@ -858,6 +859,7 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); MipsABIInfo ABI = STI.getABI(); unsigned FP = ABI.GetFramePtr(); @@ -883,10 +885,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (ExpandPseudo(MF).expand()) { // The spill slot should be half the size of the accumulator. If target is // mips64, it should be 64-bit, otherwise it should be 32-bit. - const TargetRegisterClass *RC = STI.hasMips64() ? - &Mips::GPR64RegClass : &Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + const TargetRegisterClass &RC = STI.hasMips64() ? + Mips::GPR64RegClass : Mips::GPR32RegClass; + int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlignment(RC), + false); RS->addScavengingFrameIndex(FI); } @@ -897,10 +900,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (isInt<16>(MaxSPOffset)) return; - const TargetRegisterClass *RC = - ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(), - RC->getAlignment(), false); + const TargetRegisterClass &RC = + ABI.ArePtrs64bit() ?
Mips::GPR64RegClass : Mips::GPR32RegClass; + int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC), + TRI->getSpillAlignment(RC), + false); RS->addScavengingFrameIndex(FI); } diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 91e712a7a54e..ee074798563d 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -207,13 +207,16 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::SDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC164; - else if (RC->hasType(MVT::v16i8)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::ST_B; - else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || + TRI->isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::ST_H; - else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || + TRI->isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::ST_W; - else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || + TRI->isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::ST_D; else if (Mips::LO32RegClass.hasSubClassEq(RC)) Opc = Mips::SW; @@ -280,13 +283,16 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::LDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC164; - else if (RC->hasType(MVT::v16i8)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::LD_B; - else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || + TRI->isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::LD_H; - else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || + TRI->isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::LD_W; - else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) + else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || + TRI->isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::LD_D; else if (Mips::HI32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; @@ -567,8 +573,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize(); - unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize(); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI, MF)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI, MF)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/lib/Target/Mips/Relocation.txt b/lib/Target/Mips/Relocation.txt new file mode 100644 index 000000000000..f1a6fd8645f6 --- /dev/null +++ b/lib/Target/Mips/Relocation.txt @@ -0,0 +1,125 @@ +MIPS Relocation Principles + +In LLVM, there are several elements of the llvm::ISD::NodeType enum +that deal with addresses and/or relocations. These are defined in +include/llvm/Target/TargetSelectionDAG.td, namely: + GlobalAddress, GlobalTLSAddress, JumpTable, ConstantPool, + ExternalSymbol, BlockAddress +The MIPS backend uses several principles to handle these. + +1. 
Code for lowering address references to machine-dependent code is +factored into common code for generating different address forms and +is called by the relocation-model-specific lowering function, using +templated functions. For example: + + // lib/Target/Mips/MipsISelLowering.cpp + SDValue MipsTargetLowering:: + lowerJumpTable(SDValue Op, SelectionDAG &DAG) const + +calls + + template <class NodeTy> // lib/Target/Mips/MipsISelLowering.h + SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty, + SelectionDAG &DAG, bool IsN32OrN64) const + +which calls the overloaded function: + + // lib/Target/Mips/MipsISelLowering.h + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + +2. Generic address nodes are lowered to some combination of target +independent and machine specific SDNodes (for example: +MipsISD::{Highest, Higher, Hi, Lo}) depending upon relocation model, +ABI, and compilation options. + +The choice of specific instructions that are to be used is delegated +to ISel which in turn relies on TableGen patterns to choose subtarget +specific instructions. For example, in getAddrLocal, the pseudo-code +generated is: + + (add (load (wrapper $gp, %got(sym))), %lo(sym)) + +where "%lo" represents an instance of an SDNode with opcode +"MipsISD::Lo", "wrapper" indicates one with opcode "MipsISD::Wrapper", +and "%got" the global table pointer "getGlobalReg(...)". The "add" is +"ISD::ADD", not a target-dependent one. + +3. A TableGen multiclass pattern "MipsHiLoRelocs" is used to define a +template pattern parameterized over the load upper immediate +instruction, the add operation, the zero register, and the register class. +Here the instantiation of MipsHiLoRelocs in MipsInstrInfo.td is used +for MIPS32 to compute addresses for the static relocation model. + + // lib/Target/Mips/MipsInstrInfo.td + multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu, + Register ZeroReg, RegisterOperand GPROpnd> { + def : MipsPat<(MipsHi tglobaladdr:$in), (Lui tglobaladdr:$in)>; + ... + def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>; + ... + def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)), + (Addiu GPROpnd:$hi, tglobaladdr:$lo)>; + ... + } + defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>; + + // lib/Target/Mips/Mips64InstrInfo.td + defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32; + +The instantiation in Mips64InstrInfo.td is used for MIPS64 in ILP32 +mode, as guarded by the predicate "SYM_32", and also for a submode of +LP64 where symbols are assumed to be 32 bits wide. A similar +multiclass for MIPS64 in LP64 mode is also defined: + + // lib/Target/Mips/Mips64InstrInfo.td + multiclass MipsHighestHigherHiLoRelocs<Instruction Lui, + Instruction Daddiu> { + ... + def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)), + (Lui tglobaladdr:$in)>; + ... + def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)), + (Daddiu ZERO_64, tglobaladdr:$in)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + ... + def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + ...
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))), + (Daddiu GPR64:$hi, tglobaladdr:$lo)>; + } + +and it is instantiated twice: + + // lib/Target/Mips/Mips64InstrInfo.td + defm : MipsHighestHigherHiLoRelocs<LUi64, DADDiu>, SYM_64; + // lib/Target/Mips/MicroMips64r6InstrInfo.td + defm : MipsHighestHigherHiLoRelocs<LUi64, DADDIU_MM64R6>, SYM_64, + ISA_MICROMIPS64R6; + +These patterns are used during instruction selection to match +MipsISD::{Highest, Higher, Hi, Lo} to a specific machine instruction +and operands. + +More details on how multiclasses in TableGen work can be found in the +section "Multiclass definitions and instances" in the document +"TableGen Language Introduction". + +4. Instruction definitions are multiply defined to cover the different +register classes. In some cases, such as LW/LW64, this also accounts +for the difference in the results of instruction execution. On MIPS32, +"lw" loads a 32-bit value from memory. On MIPS64, "lw" loads a 32-bit +value from memory and sign-extends the value to 64 bits. + + // lib/Target/Mips/MipsInstrInfo.td + def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM; + // lib/Target/Mips/Mips64InstrInfo.td + def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM; + +defines two names "LUi" and "LUi64" with two different register +classes, but with the same encoding---"LUI_FM". These instructions load a +16-bit immediate into bits 31-16 and clear the lower 16 bits. On MIPS64, +the result is sign-extended to 64 bits. diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 3026f0be242d..0f6c2e53e60a 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -38,7 +38,7 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (DestRC->getSize() != SrcRC->getSize()) + if (RegInfo.getRegSizeInBits(*DestRC) != RegInfo.getRegSizeInBits(*SrcRC)) report_fatal_error("Copy one register into another with a different width"); unsigned Op; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 4c9430a2eca0..2a402deccbca 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1898,12 +1898,13 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, MachineFrameInfo &MFI = MF.getFrameInfo(); if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; - const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + const TargetRegisterClass &GPRC = PPC::GPRCRegClass; + const TargetRegisterClass &G8RC = PPC::G8RCRegClass; + const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC; + const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); // Might we have over-aligned allocas? bool HasAlVars = MFI.hasVarSizedObjects() && @@ -1911,9 +1912,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // These kinds of spills might need two registers.
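+ // (A CR or VRSAVE spill is staged through a GPR, and over-aligned allocas
+ // can occupy another register for frame addressing, hence the second slot.)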
if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index f7663d8e5185..4659a2ea8032 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9057,6 +9057,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -9070,7 +9071,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -9153,7 +9154,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); - const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo(); MIB.addRegMask(TRI->getNoPreservedMask()); BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 8e159f47ea2e..790a8902b3d2 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -440,8 +440,8 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(Opcode)); } -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. -void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +/// Return the noop instruction to use for a noop. 
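+/// (PPC::NOP is the canonical "ori 0, 0, 0" encoding.)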
+void PPCInstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(PPC::NOP); } diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index f11aed8fa268..b30d09e03ec4 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -269,7 +269,7 @@ public: /// unsigned getInstSizeInBytes(const MachineInstr &MI) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; std::pair<unsigned, unsigned> decomposeMachineOperandsTargetFlags(unsigned TF) const override; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 455d1ee1564a..f120a98e9457 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -3234,6 +3234,7 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -3245,7 +3246,8 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index c8ff9558cc88..fee008b9572a 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -104,8 +104,9 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineOperand &LowOffsetOp = MI->getOperand(2); LowOffsetOp.setImm(LowOffsetOp.getImm() + 8); - // Clear the kill flags for the base and index registers in the first - // instruction. + // Clear the kill flags on the registers in the first instruction. 
+ if (EarlierMI->getOperand(0).isReg() && EarlierMI->getOperand(0).isUse()) + EarlierMI->getOperand(0).setIsKill(false); EarlierMI->getOperand(1).setIsKill(false); EarlierMI->getOperand(3).setIsKill(false); @@ -1114,10 +1115,9 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl( return nullptr; unsigned OpNum = Ops[0]; - assert(Size == - MF.getRegInfo() - .getRegClass(MI.getOperand(OpNum).getReg()) - ->getSize() && + assert(Size * 8 == + TRI->getRegSizeInBits(*MF.getRegInfo() + .getRegClass(MI.getOperand(OpNum).getReg())) && "Invalid size combination"); if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 && diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index d9c2dba5bace..4178ec0b28f0 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -45,10 +45,11 @@ using namespace llvm; //===----------------------------------------------------------------------===// MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); const TargetRegisterClass *TRC = MRI->getRegClass(RegNo); for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) - if (TRC->hasType(T)) + if (TRI->isTypeLegalForClass(*TRC, T)) return T; DEBUG(errs() << "Unknown type for register number: " << RegNo); llvm_unreachable("Unknown register type"); diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 324da650e74e..c1cfc82b4a81 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3094,6 +3094,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { + getParser().setParsingInlineAsm(false); if (getLexer().isNot(AsmToken::EndOfStatement)) { if (Parser.getTok().getString() == "prefix") Parser.Lex(); @@ -3106,6 +3107,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); + getParser().setParsingInlineAsm(true); if (getLexer().isNot(AsmToken::EndOfStatement)) { if (Parser.getTok().getString() == "noprefix") Parser.Lex(); diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index fdcc7e1ab7b0..19c93cfff0fe 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -95,7 +95,8 @@ void initializeFixupBWInstPassPass(PassRegistry &); /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); -InstructionSelector *createX86InstructionSelector(X86Subtarget &, +InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, + X86Subtarget &, X86RegisterBankInfo &); void initializeEvexToVexInstPassPass(PassRegistry &); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 8fcc8e31d5d4..d2f650cf8f47 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -273,6 +273,16 @@ def FeatureFastSHLDRotate "fast-shld-rotate", "HasFastSHLDRotate", "true", "SHLD can be used as a faster rotate">; +// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka +// "string operations"). See "REP String Enhancement" in the Intel Software +// Development Manual. 
This feature essentially means that REP MOVSB will copy +// using the largest available size instead of copying bytes one by one, making +// it at least as fast as REPMOVS{W,D,Q}. +def FeatureERMSB + : SubtargetFeature< + "ermsb", "HasERMSB", "true", + "REP MOVS/STOS are fast">; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// @@ -498,6 +508,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ FeatureAVX2, FeatureBMI, FeatureBMI2, + FeatureERMSB, FeatureFMA, FeatureLZCNT, FeatureMOVBE, diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index 137ef166aaeb..161bfa7b5474 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -53,7 +53,6 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, return; } - SmallVector<uint64_t, 4> BitOffsets; SmallVector<unsigned, 8> SplitRegs; EVT PartVT = TLI.getRegisterType(Context, VT); @@ -64,8 +63,10 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)), PartTy, OrigArg.Flags}; SplitArgs.push_back(Info); - PerformArgSplit(Info.Reg, PartVT.getSizeInBits() * i); + SplitRegs.push_back(Info.Reg); } + + PerformArgSplit(SplitRegs); } namespace { @@ -112,10 +113,9 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); SmallVector<ArgInfo, 8> SplitArgs; - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](unsigned Reg, uint64_t Offset) { - MIRBuilder.buildExtract(Reg, VReg, Offset); - }); + splitToValueTypes( + OrigArg, SplitArgs, DL, MRI, + [&](ArrayRef<unsigned> Regs) { MIRBuilder.buildUnmerge(Regs, VReg); }); FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) @@ -183,22 +183,10 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, for (auto &Arg : F.args()) { ArgInfo OrigArg(VRegs[Idx], Arg.getType()); setArgFlags(OrigArg, Idx + 1, DL, F); - LLT Ty = MRI.getType(VRegs[Idx]); - unsigned Dst = VRegs[Idx]; - bool Split = false; splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](unsigned Reg, uint64_t Offset) { - if (!Split) { - Split = true; - Dst = MRI.createGenericVirtualRegister(Ty); - MIRBuilder.buildUndef(Dst); - } - unsigned Tmp = MRI.createGenericVirtualRegister(Ty); - MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset); - Dst = Tmp; + [&](ArrayRef<unsigned> Regs) { + MIRBuilder.buildMerge(VRegs[Idx], Regs); }); - if (Dst != VRegs[Idx]) - MIRBuilder.buildCopy(VRegs[Idx], Dst); Idx++; } diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 204e6974c702..8a8afb568298 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,14 +34,15 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; + private: /// A function of this type is used to perform value split action. 
- typedef std::function<void(unsigned, uint64_t)> SplitArgTy; + typedef std::function<void(ArrayRef<unsigned>)> SplitArgTy; void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, SplitArgTy SplitArg) const; }; -} // End of namespace llvm; +} // namespace llvm #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 036f5d2610e4..b8477810b4c9 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -2149,7 +2149,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { if (!LHSReg || !RHSReg) return false; - unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); + unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8); unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); updateValueMap(I, ResultReg); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 8678a13b95d0..a94045cd536d 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1783,6 +1783,14 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } +int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, + int FI, unsigned &FrameReg, + int Adjustment) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + FrameReg = TRI->getStackRegister(); + return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment; +} + int X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, @@ -1839,9 +1847,6 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 && "we don't handle this case!"); - // Fill in FrameReg output argument. - FrameReg = TRI->getStackRegister(); - // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. 
Each box below is @@ -1866,12 +1871,8 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, // (C - E) == (C - A) - (B - A) + (B - E) // { Using [1], [2] and [3] above } // == getObjectOffset - LocalAreaOffset + StackSize - // - - // Get the Offset from the StackPointer - int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea(); - return Offset + StackSize; + return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize); } bool X86FrameLowering::assignCalleeSavedSpillSlots( @@ -1923,14 +1924,15 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + unsigned Size = TRI->getSpillSize(*RC); + unsigned Align = TRI->getSpillAlignment(*RC); // ensure alignment - SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; // spill into slot - SpillSlotOffset -= RC->getSize(); - int SlotIndex = - MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + SpillSlotOffset -= Size; + int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); - MFI.ensureMaxAlignment(RC->getAlignment()); + MFI.ensureMaxAlignment(Align); } return true; diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 863dc8b22968..7d214cabad53 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -100,6 +100,8 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getFrameIndexReferenceSP(const MachineFunction &MF, + int FI, unsigned &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const override; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b5f29fb400ef..ada46643a5fe 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5060,8 +5060,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -14424,8 +14424,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // If the input is a buildvector just emit a smaller one. unsigned ElemsPerChunk = ResVT.getVectorNumElements(); if (In.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT, - makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); // Everything else is legal. 
return Op; @@ -25944,6 +25944,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -25960,7 +25961,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -30207,7 +30209,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); - if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { // If we changed the computation somewhere in the DAG, this change will @@ -33777,7 +33779,7 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); } @@ -35937,7 +35939,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. - if (Res.second->hasType(VT) || VT == MVT::Other) + if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. 
"ax" with MVT::32 should @@ -35975,11 +35977,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; - else if (X86::VR128RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; - else if (X86::VR256RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; - else if (X86::VR512RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index bfd21c062aa2..66382014f6e8 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -989,10 +989,12 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Constraints = "$src1 = $dst" - def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + let mayLoad = 1, mayStore = 1 in { + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 7b456fd68343..26444dd1f619 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -6284,9 +6284,11 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); assert(Cond.size() == 1 && "Invalid Cond array"); unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), - MRI.getRegClass(DstReg)->getSize(), + TRI.getRegSizeInBits(RC) / 8, false /*HasMemoryOperand*/); BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } @@ -6557,7 +6559,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool HasAVX512 = STI.hasAVX512(); bool HasVLX = STI.hasVLX(); - switch (RC->getSize()) { + switch (STI.getRegisterInfo()->getSpillSize(*RC)) { default: llvm_unreachable("Unknown spill size"); case 1: @@ -6603,28 +6605,36 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { - assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); - // If stack is realigned we can use aligned stores. - if (isStackAligned) - return load ? - (HasVLX ? X86::VMOVAPSZ128rm : - HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : - HasAVX ? X86::VMOVAPSrm : - X86::MOVAPSrm): - (HasVLX ? X86::VMOVAPSZ128mr : - HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : - HasAVX ? X86::VMOVAPSmr : - X86::MOVAPSmr); - else - return load ? - (HasVLX ? X86::VMOVUPSZ128rm : - HasAVX512 ? 
X86::VMOVUPSZ128rm_NOVLX : - HasAVX ? X86::VMOVUPSrm : - X86::MOVUPSrm): - (HasVLX ? X86::VMOVUPSZ128mr : - HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : - HasAVX ? X86::VMOVUPSmr : - X86::MOVUPSmr); + if (X86::VR128XRegClass.hasSubClassEq(RC)) { + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? + (HasVLX ? X86::VMOVAPSZ128rm : + HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : + HasAVX ? X86::VMOVAPSrm : + X86::MOVAPSrm): + (HasVLX ? X86::VMOVAPSZ128mr : + HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX : + HasAVX ? X86::VMOVAPSmr : + X86::MOVAPSmr); + else + return load ? + (HasVLX ? X86::VMOVUPSZ128rm : + HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX : + HasAVX ? X86::VMOVUPSrm : + X86::MOVUPSrm): + (HasVLX ? X86::VMOVUPSZ128mr : + HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : + HasAVX ? X86::VMOVUPSmr : + X86::MOVUPSmr); + } + if (X86::BNDRRegClass.hasSubClassEq(RC)) { + if (STI.is64Bit()) + return load ? X86::BNDMOVRM64rm : X86::BNDMOVMR64mr; + else + return load ? X86::BNDMOVRM32rm : X86::BNDMOVMR32mr; + } + llvm_unreachable("Unknown 16-byte regclass"); } case 32: assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); @@ -6709,9 +6719,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() && + assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); @@ -6728,7 +6738,8 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); @@ -6748,7 +6759,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); @@ -6763,7 +6774,8 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const { - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); @@ -7222,7 +7234,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, NewOpc = getSETFromCond(NewCC, HasMemoryOperand); else { unsigned DstReg = 
Instr.getOperand(0).getReg(); - NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + NewOpc = getCMovFromCond(NewCC, TRI->getRegSizeInBits(*DstRC)/8, HasMemoryOperand); } @@ -7750,7 +7763,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned DstIdx = (Imm >> 4) & 3; unsigned SrcIdx = (Imm >> 6) & 3; - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size <= RCSize && 4 <= Align) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; @@ -7772,7 +7787,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. if (OpNum == 2) { - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size <= RCSize && 8 <= Align) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : @@ -7861,7 +7878,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, + &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -8302,11 +8322,13 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineFunction &MF) { unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); - unsigned RegSize = - MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); + unsigned RegSize = TRI.getRegSizeInBits(*RC); if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) && - RegSize > 4) { + RegSize > 32) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user // instruction isn't scalar (SS). @@ -8357,7 +8379,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) && - RegSize > 8) { + RegSize > 64) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user // instruction isn't scalar (SD). 
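The two thresholds in the hunks above change from 4 and 8 to 32 and 64 because isNonFoldablePartialRegisterLoad now measures register width in bits, via TRI.getRegSizeInBits(*RC), where the old RC->getSize() answered in bytes. A minimal standalone sketch of that unit change follows; the constant names are illustrative stand-ins, not part of the patch:

    #include <cassert>

    // Stand-ins: OldXmmSizeInBytes models RC->getSize(), and the derived
    // value in bits models TRI.getRegSizeInBits(*RC).
    constexpr unsigned OldXmmSizeInBytes = 16; // an XMM register
    constexpr unsigned NewXmmSizeInBits = OldXmmSizeInBytes * 8;

    int main() {
      // Old predicate: destination wider than a 32-bit MOVSS load, in bytes.
      assert(OldXmmSizeInBytes > 4);
      // New predicate: the same comparison, expressed in bits.
      assert(NewXmmSizeInBits > 32);
      return 0;
    }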
@@ -8702,6 +8724,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, bool FoldedStore = I->second.second & TB_FOLDED_STORE; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); unsigned NumDefs = MCID.NumDefs; std::vector<SDValue> AddrOps; @@ -8724,7 +8747,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, // Emit the load instruction. SDNode *Load = nullptr; if (FoldedLoad) { - EVT VT = *RC->vt_begin(); + EVT VT = *TRI.legalclasstypes_begin(*RC); std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs = MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), @@ -8736,7 +8759,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, @@ -8752,7 +8775,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); - VTs.push_back(*DstRC->vt_begin()); + VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); @@ -8781,7 +8804,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = @@ -9514,7 +9537,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { } /// Return the noop instruction to use for a noop. 
-void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void X86InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 2fee48570ce1..38567831b3a4 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -457,7 +457,7 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index e31d2769047b..c3def461afdc 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; +def HasERMSB : Predicate<"Subtarget->hasERMSB()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index e1bf28cbf612..f22a50200c9a 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4602,17 +4602,17 @@ def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), (v4i32 (scalar_to_vector (loadi32 addr:$src))))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; } // ExeDomain = SSEPackedInt @@ -4681,7 +4681,7 @@ def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; @@ -4694,7 +4694,7 @@ def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt @@ -4721,7 +4721,7 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", + "movq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, 
(bitconvert FR64:$src))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), @@ -4811,12 +4811,12 @@ let Predicates = [UseSSE2] in { } } -// These are the correct encodings of the instructions so that we know how to -// read correct assembly, even though we continue to emit the wrong ones for -// compatibility with Darwin's buggy assembler. -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", +// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of +// "movq" due to a MacOS parsing limitation. In order to parse old assembly, we add +// these aliases. +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; -def : InstAlias<"movq\t{$src, $dst|$dst, $src}", +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; // Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", @@ -7144,33 +7144,37 @@ let Predicates = [UseSSE41] in { /// SS42I_binop_rm - Simple SSE 4.2 binary operator multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1> { + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>; + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, Sched<[itins.Sched]>; def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>; + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - loadv2i64, i128mem, 0>, VEX_4V, VEX_WIG; + loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>, + VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L, VEX_WIG; + loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>, + VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, - memopv2i64, i128mem>; + memopv2i64, i128mem, SSE_INTALU_ITINS_P>; //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index fb9315792892..d0f1b7091da9 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -39,11 +39,16 @@ using namespace llvm; namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "X86GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + class X86InstructionSelector : public InstructionSelector { public: - X86InstructionSelector(const X86Subtarget &STI, + X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); + void beginFunction(const MachineFunction &MF) override; bool select(MachineInstr &I) const
override; private: @@ -70,10 +75,17 @@ bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + const X86TargetMachine &TM; const X86Subtarget &STI; const X86InstrInfo &TII; const X86RegisterInfo &TRI; const X86RegisterBankInfo &RBI; + bool OptForSize; + bool OptForMinSize; + + PredicateBitset AvailableFeatures; + PredicateBitset computeAvailableFeatures(const MachineFunction *MF, + const X86Subtarget *Subtarget) const; #define GET_GLOBALISEL_TEMPORARIES_DECL #include "X86GenGlobalISel.inc" @@ -86,10 +98,12 @@ private: #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_IMPL -X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI, +X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM, + const X86Subtarget &STI, const X86RegisterBankInfo &RBI) - : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), OptForSize(false), + OptForMinSize(false), AvailableFeatures() #define GET_GLOBALISEL_TEMPORARIES_INIT #include "X86GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT @@ -181,6 +195,12 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } +void X86InstructionSelector::beginFunction(const MachineFunction &MF) { + OptForSize = MF.getFunction()->optForSize(); + OptForMinSize = MF.getFunction()->optForMinSize(); + AvailableFeatures = computeAvailableFeatures(&MF, &STI); +} + bool X86InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -571,7 +591,8 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, } InstructionSelector * -llvm::createX86InstructionSelector(X86Subtarget &Subtarget, +llvm::createX86InstructionSelector(const X86TargetMachine &TM, + X86Subtarget &Subtarget, X86RegisterBankInfo &RBI) { - return new X86InstructionSelector(Subtarget, RBI); + return new X86InstructionSelector(TM, Subtarget, RBI); } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 9bab9a4cf3ba..1f16f3c9a14d 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -137,25 +137,29 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::FR32RegClassID: case X86::FR64RegClassID: // If AVX-512 isn't supported we should only inflate to these classes. - if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + if (!Subtarget.hasAVX512() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128RegClassID: case X86::VR256RegClassID: // If VLX isn't supported we should only inflate to these classes. - if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + if (!Subtarget.hasVLX() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128XRegClassID: case X86::VR256XRegClassID: // If VLX isn't supported we shouldn't inflate to these classes. - if (Subtarget.hasVLX() && Super->getSize() == RC->getSize()) + if (Subtarget.hasVLX() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::FR32XRegClassID: case X86::FR64XRegClassID: // If AVX-512 isn't supported we shouldn't inflate to these classes.
- if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize()) + if (Subtarget.hasAVX512() && + getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::GR8RegClassID: @@ -168,7 +172,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case X86::VR512RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. - if (Super->getSize() == RC->getSize()) + if (getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; } Super = *I++; @@ -669,32 +673,28 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MI.getParent()->getParent(); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned BasePtr; - unsigned Opc = MI.getOpcode(); - bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || - Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; - - if (hasBasePointer(MF)) - BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); - else if (needsStackRealignment(MF)) - BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); - else if (AfterFPPop) - BasePtr = StackPtr; - else - BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // Determine base register and offset. + int FIOffset; + unsigned BasePtr; + if (MI.isReturn()) { + assert((!needsStackRealignment(MF) || + MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && + "Return instruction can only reference SP relative frame objects"); + FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0); + } else { + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr); + } // LOCAL_ESCAPE uses a single offset, with no register. It only works in the // simple FP case, and doesn't work with stack realignment. On 32-bit, the // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. - unsigned IgnoredFrameReg; + unsigned Opc = MI.getOpcode(); if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - int Offset; - Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - FI.ChangeToImmediate(Offset); + FI.ChangeToImmediate(FIOffset); return; } @@ -710,15 +710,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // FrameIndex with base register. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false); - // Now add the frame object offset to the offset from EBP. - int FIOffset; - if (AfterFPPop) { - // Tail call jmp happens after FP is popped. 
- const MachineFrameInfo &MFI = MF.getFrameInfo(); - FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); - } else - FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - if (BasePtr == StackPtr) FIOffset += SPAdj; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 9da8a18965ea..1a72a0ba3a64 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -106,7 +106,6 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SDValue Count; ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); unsigned BytesLeft = 0; - bool TwoRepStos = false; if (ValC) { unsigned ValReg; uint64_t Val = ValC->getZExtValue() & 255; @@ -163,20 +162,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); - if (TwoRepStos) { - InFlag = Chain.getValue(1); - Count = Size; - EVT CVT = Count.getValueType(); - SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, - DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, - CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, - Left, InFlag); - InFlag = Chain.getValue(1); - Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); - } else if (BytesLeft) { + if (BytesLeft) { // Handle the last 1 - 7 bytes. unsigned Offset = SizeVal - BytesLeft; EVT AddrVT = Dst.getValueType(); @@ -195,6 +181,24 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( return Chain; } +namespace { + +// Represents a cover of a buffer of SizeVal bytes with blocks of size +// AVT, as well as how many bytes remain (BytesLeft is always smaller than +// the block size). +struct RepMovsRepeats { + RepMovsRepeats(const uint64_t SizeVal, const MVT& AVT) { + const unsigned UBytes = AVT.getSizeInBits() / 8; + Count = SizeVal / UBytes; + BytesLeft = SizeVal % UBytes; + } + + unsigned Count; + unsigned BytesLeft; +}; + +} // namespace + SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, @@ -229,7 +233,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( return SDValue(); MVT AVT; - if (Align & 1) + if (Subtarget.hasERMSB()) + // If the target has enhanced REP MOVSB, then it's at least as fast to use + // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle + // BytesLeft. + AVT = MVT::i8; + else if (Align & 1) AVT = MVT::i8; else if (Align & 2) AVT = MVT::i16; @@ -240,14 +249,18 @@ // QWORD aligned AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; - unsigned UBytes = AVT.getSizeInBits() / 8; - unsigned CountVal = SizeVal / UBytes; - SDValue Count = DAG.getIntPtrConstant(CountVal, dl); - unsigned BytesLeft = SizeVal % UBytes; + RepMovsRepeats Repeats(SizeVal, AVT); + if (Repeats.BytesLeft > 0 && + DAG.getMachineFunction().getFunction()->optForMinSize()) { + // When aggressively optimizing for size, avoid generating the code to handle + // BytesLeft. + AVT = MVT::i8; + Repeats = RepMovsRepeats(SizeVal, AVT); + } SDValue InFlag; Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ?
X86::RCX : X86::ECX, - Count, InFlag); + DAG.getIntPtrConstant(Repeats.Count, dl), InFlag); InFlag = Chain.getValue(1); Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); @@ -262,9 +275,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SmallVector<SDValue, 4> Results; Results.push_back(RepMovs); - if (BytesLeft) { + if (Repeats.BytesLeft) { // Handle the last 1 - 7 bytes. - unsigned Offset = SizeVal - BytesLeft; + unsigned Offset = SizeVal - Repeats.BytesLeft; EVT DstVT = Dst.getValueType(); EVT SrcVT = Src.getValueType(); EVT SizeVT = Size.getValueType(); @@ -275,7 +288,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), - DAG.getConstant(BytesLeft, dl, SizeVT), + DAG.getConstant(Repeats.BytesLeft, dl, + SizeVT), Align, isVolatile, AlwaysInline, false, DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 92a68759195c..4154530d04e7 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() { HasFastVectorFSQRT = false; HasFastLZCNT = false; HasFastSHLDRotate = false; + HasERMSB = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d0d88d326949..fd057f36c890 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -232,6 +232,9 @@ protected: /// True if SHLD based rotate is fast. bool HasFastSHLDRotate; + /// True if the processor has enhanced REP MOVSB/STOSB. + bool HasERMSB; + /// True if the short functions should be padded to prevent /// a stall when returning too early. 
bool PadShortFunctions; @@ -472,6 +475,7 @@ public: bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } + bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 03a1958121ab..623cf38aa951 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -286,7 +286,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo()); GISel->RegBankInfo.reset(RBI); - GISel->InstSelector.reset(createX86InstructionSelector(*I, *RBI)); + GISel->InstSelector.reset(createX86InstructionSelector(*this, *I, *RBI)); #endif I->setGISelAccessor(*GISel); } diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index a752357400b3..784612038c09 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -575,18 +575,17 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const { assert(RS && "requiresRegisterScavenging failed"); MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); // Reserve slots close to SP or frame pointer for Scavenging spills. // When using SP for small frames, we don't need any scratch registers. // When using SP for large frames, we may need 2 scratch registers. // When using FP, for large or small frames, we may need 1 scratch register. 
+ unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); if (XFI->isLargeFrame(MF) || hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); if (XFI->isLargeFrame(MF) && !hasFP(MF)) - RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(), - RC->getAlignment(), - false)); + RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); } diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 45437815fa37..2efcd46cd8d4 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -1605,7 +1605,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) || + if (TLI.ShrinkDemandedConstant(OutVal, DemandedMask, TLO) || TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); @@ -1622,7 +1622,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Time, DemandedMask) || + if (TLI.ShrinkDemandedConstant(Time, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne, TLO)) DCI.CommitTargetLoweringOpt(TLO); diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index e91536ca1e83..75af0e97dfb5 100644 --- a/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -10,6 +10,7 @@ #include "XCoreMachineFunctionInfo.h" #include "XCoreInstrInfo.h" #include "llvm/IR/Function.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -35,13 +36,15 @@ int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) { if (LRSpillSlotSet) { return LRSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); if (! MF.getFunction()->isVarArg()) { // A fixed offset of 0 allows us to save / restore LR using entsp / retsp. 
- LRSpillSlot = MFI.CreateFixedObject(RC->getSize(), 0, true); + LRSpillSlot = MFI.CreateFixedObject(TRI.getSpillSize(RC), 0, true); } else { - LRSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + LRSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), true); } LRSpillSlotSet = true; return LRSpillSlot; @@ -51,9 +54,11 @@ int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) { if (FPSpillSlotSet) { return FPSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - FPSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + FPSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC), + TRI.getSpillAlignment(RC), true); FPSpillSlotSet = true; return FPSpillSlot; } @@ -62,10 +67,13 @@ const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) { if (EHSpillSlotSet) { return EHSpillSlot; } - const TargetRegisterClass *RC = &XCore::GRRegsRegClass; + const TargetRegisterClass &RC = XCore::GRRegsRegClass; + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); - EHSpillSlot[0] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); - EHSpillSlot[1] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true); + unsigned Size = TRI.getSpillSize(RC); + unsigned Align = TRI.getSpillAlignment(RC); + EHSpillSlot[0] = MFI.CreateStackObject(Size, Align, true); + EHSpillSlot[1] = MFI.CreateStackObject(Size, Align, true); EHSpillSlotSet = true; return EHSpillSlot; }
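The same three-line pattern recurs across the AArch64, PowerPC, XCore and X86 hunks in this patch: spill size and alignment are now queried from TargetRegisterInfo rather than from TargetRegisterClass before a stack object is created. A condensed standalone sketch of that pattern, with stub types standing in for the LLVM classes and fixed illustrative values:

    #include <cstdio>

    // Stub stand-ins for the LLVM types; only the members the pattern
    // touches are modeled here.
    struct TargetRegisterClass {};

    struct TargetRegisterInfo {
      // In LLVM these consult the spill size/alignment tables generated
      // from the target's .td files; fixed values stand in below.
      unsigned getSpillSize(const TargetRegisterClass &) const { return 8; }
      unsigned getSpillAlignment(const TargetRegisterClass &) const { return 8; }
    };

    struct MachineFrameInfo {
      int NextFI = 0;
      int CreateStackObject(unsigned Size, unsigned Align, bool /*IsSpillSlot*/) {
        std::printf("fi#%d: size=%u align=%u\n", NextFI, Size, Align);
        return NextFI++;
      }
    };

    int main() {
      TargetRegisterInfo TRI;
      TargetRegisterClass GPR64; // stands in for e.g. XCore::GRRegsRegClass
      MachineFrameInfo MFI;

      // The pattern this patch installs wherever a spill slot is created:
      unsigned Size = TRI.getSpillSize(GPR64);
      unsigned Align = TRI.getSpillAlignment(GPR64);
      int FI = MFI.CreateStackObject(Size, Align, false);
      (void)FI;
      return 0;
    }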