264 files changed, 11277 insertions, 3282 deletions
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index a0a09e4a833b..da22d8d9e4c5 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -206,7 +206,7 @@ void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
     // FIXME: Can we get anything other than a plain symbol here?
     assert(!MO.getTargetFlags() && "Unknown operand target flag!");
 
-    O << *Sym;
+    Sym->print(O, MAI);
     printOffset(MO.getOffset(), O);
     break;
   }
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 78a2021f79a3..1ea4abcf05fa 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -156,6 +156,9 @@ public:
 
   SDNode *SelectLIBM(SDNode *N);
 
+  SDNode *SelectReadRegister(SDNode *N);
+  SDNode *SelectWriteRegister(SDNode *N);
+
 // Include the pieces autogenerated from the target description.
 #include "AArch64GenDAGISel.inc"
 
@@ -2114,6 +2117,120 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
   return true;
 }
 
+// Inspects a register string of the form o0:op1:CRn:CRm:op2, extracts the
+// integer value of each field and combines these into the single operand
+// value used by the MRS/MSR instructions. Returns -1 if there is no ':'.
+static int getIntOperandFromRegisterString(StringRef RegString) {
+  SmallVector<StringRef, 5> Fields;
+  RegString.split(Fields, ":");
+
+  if (Fields.size() == 1)
+    return -1;
+
+  assert(Fields.size() == 5
+            && "Invalid number of fields in read register string");
+
+  SmallVector<int, 5> Ops;
+  bool AllIntFields = true;
+
+  for (StringRef Field : Fields) {
+    unsigned IntField = 0; // Keep it defined when getAsInteger() fails.
+    AllIntFields &= !Field.getAsInteger(10, IntField);
+    Ops.push_back(IntField);
+  }
+
+  assert(AllIntFields &&
+          "Unexpected non-integer value in special register string.");
+
+  // Need to combine the integer fields of the string into a single value
+  // based on the bit encoding of MRS/MSR instruction.
+  return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
+         (Ops[3] << 3) | (Ops[4]);
+}
+
+// Lower the read_register intrinsic to an MRS instruction node if the special
+// register string argument is either of the form detailed in the ACLE (the
+// form described in getIntOperandFromRegisterString) or is a named register
+// known by the MRS SysReg mapper. Returns nullptr otherwise.
+SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) {
+  const MDNodeSDNode *MD = cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1)
+    return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
+                                  MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(0));
+
+  // Use the sysreg mapper to map the remaining possible strings to the
+  // value for the register to be used for the instruction operand.
+  AArch64SysReg::MRSMapper mapper;
+  bool IsValidSpecialReg;
+  Reg = mapper.fromString(RegString->getString(),
+                          Subtarget->getFeatureBits(),
+                          IsValidSpecialReg);
+  if (IsValidSpecialReg)
+    return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0),
+                                  MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(0));
+
+  return nullptr;
+}
+
+// Lower the write_register intrinsic to an MSR instruction node if the special
+// register string argument is either of the form detailed in the ACLE (the
+// form described in getIntOperandFromRegisterString) or is a named register
+// known by the PState or MSR SysReg mappers. Returns nullptr otherwise.
+SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) {
+  const MDNodeSDNode *MD = cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = cast<MDString>(MD->getMD()->getOperand(0));
+  SDLoc DL(N);
+
+  int Reg = getIntOperandFromRegisterString(RegString->getString());
+  if (Reg != -1)
+    return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(2), N->getOperand(0));
+
+  // Check if the register was one of those allowed as the pstatefield value in
+  // the MSR (immediate) instruction. To accept the values allowed in the
+  // pstatefield for the MSR (immediate) instruction, we also require that an
+  // immediate value has been provided as an argument, we know that this is
+  // the case as it has been ensured by semantic checking.
+  AArch64PState::PStateMapper PMapper;
+  bool IsValidSpecialReg;
+  Reg = PMapper.fromString(RegString->getString(),
+                           Subtarget->getFeatureBits(),
+                           IsValidSpecialReg);
+  if (IsValidSpecialReg) {
+    assert (isa<ConstantSDNode>(N->getOperand(2))
+              && "Expected a constant integer expression.");
+    uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  CurDAG->getTargetConstant(Immed, DL, MVT::i16),
+                                  N->getOperand(0));
+  }
+
+  // Use the sysreg mapper to attempt to map the remaining possible strings
+  // to the value for the register to be used for the MSR (register)
+  // instruction operand.
+  AArch64SysReg::MSRMapper Mapper;
+  Reg = Mapper.fromString(RegString->getString(),
+                          Subtarget->getFeatureBits(),
+                          IsValidSpecialReg);
+
+  if (IsValidSpecialReg)
+    return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
+                                  CurDAG->getTargetConstant(Reg, DL, MVT::i32),
+                                  N->getOperand(2), N->getOperand(0));
+
+  return nullptr;
+}
+
 SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
   DEBUG(errs() << "Selecting: ");
@@ -2135,6 +2252,16 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
   default:
     break;
 
+  case ISD::READ_REGISTER:
+    if (SDNode *Res = SelectReadRegister(Node))
+      return Res;
+    break;
+
+  case ISD::WRITE_REGISTER:
+    if (SDNode *Res = SelectWriteRegister(Node))
+      return Res;
+    break;
+
   case ISD::ADD:
     if (SDNode *I = SelectMLAV64LaneV128(Node))
       return I;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index e6108c3e95e2..1616ff13535d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -76,6 +76,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
     cl::init(false));
 
+/// Value type used for condition codes.
+static const MVT MVT_CC = MVT::i32;
+
 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                              const AArch64Subtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -807,6 +810,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
   case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
   case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
+  case AArch64ISD::CCMP:              return "AArch64ISD::CCMP";
+  case AArch64ISD::CCMN:              return "AArch64ISD::CCMN";
+  case AArch64ISD::FCCMP:             return "AArch64ISD::FCCMP";
   case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
   case AArch64ISD::FMIN:              return "AArch64ISD::FMIN";
   case AArch64ISD::FMAX:              return "AArch64ISD::FMAX";
@@ -1165,10 +1171,133 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     LHS = LHS.getOperand(0);
   }
 
-  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
       .getValue(1);
 }
 
+/// Build the CCMP/CCMN/FCCMP node comparing @p LHS and @p RHS.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+                                         ISD::CondCode CC, SDValue CCOp,
+                                         SDValue Condition, unsigned NZCV,
+                                         SDLoc DL, SelectionDAG &DAG) {
+  unsigned Opcode = AArch64ISD::CCMP;
+  if (LHS.getValueType().isFloatingPoint()) {
+    Opcode = AArch64ISD::FCCMP;
+  } else if (RHS.getOpcode() == ISD::SUB &&
+             (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    // A compare against (sub 0, x) becomes a CCMN of x. See emitComparison()
+    // on why we can only do this for SETEQ and SETNE.
+    const ConstantSDNode *SubLHS = dyn_cast<ConstantSDNode>(RHS.getOperand(0));
+    if (SubLHS && SubLHS->isNullValue()) {
+      Opcode = AArch64ISD::CCMN;
+      RHS = RHS.getOperand(1);
+    }
+  }
+
+  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Checks that @p Val roots a tree of single-use AND/OR/SETCC operations.
+static bool isConjunctionDisjunctionTree(const SDValue Val, unsigned Depth) {
+  if (!Val.hasOneUse())
+    return false;
+  const unsigned Opc = Val->getOpcode();
+  if (Opc == ISD::SETCC)
+    return true;
+  // Give up on overly deep trees to protect against stack overflow, and
+  // reject any node that is neither an AND nor an OR.
+  if (Depth > 1000 || (Opc != ISD::AND && Opc != ISD::OR))
+    return false;
+  // Both operands have to be valid subtrees themselves.
+  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
+    if (!isConjunctionDisjunctionTree(Val->getOperand(OpIdx), Depth + 1))
+      return false;
+  return true;
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/FCCMP ops. For example (SETCC_0 & SETCC_1) with condition cond_0 and
+/// cond_1 can be transformed into "CMP; CCMP" with CCMP executing on cond_0
+/// and setting flags to inverted(cond_1) otherwise.
+/// This recursive function produces DAG nodes that produce condition flags
+/// suitable to determine the truth value of @p Val (which is AND/OR/SETCC)
+/// by testing the result for the condition set to @p OutCC. If @p Negate is
+/// set the opposite truth value is produced. If @p CCOp and @p Condition are
+/// given then conditional comparisons are created so that false is reported
+/// when they are false.
+static SDValue emitConjunctionDisjunctionTree(
+    SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate,
+    SDValue CCOp = SDValue(), AArch64CC::CondCode Condition = AArch64CC::AL) {
+  assert(isConjunctionDisjunctionTree(Val, 0));
+  // We're at a tree leaf, produce a c?f?cmp.
+  unsigned Opcode = Val->getOpcode();
+  if (Opcode == ISD::SETCC) {
+    SDValue LHS = Val->getOperand(0);
+    SDValue RHS = Val->getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+    bool isInteger = LHS.getValueType().isInteger();
+    if (Negate)
+      CC = getSetCCInverse(CC, isInteger);
+    SDLoc DL(Val);
+    // Determine OutCC and handle FP special case.
+    if (isInteger) {
+      OutCC = changeIntCCToAArch64CC(CC);
+    } else {
+      assert(LHS.getValueType().isFloatingPoint());
+      AArch64CC::CondCode ExtraCC;
+      changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
+      // Surprisingly, some floating point conditions can't be tested with a
+      // single condition code. Construct an additional comparison in this case.
+      // See comment below on how we deal with OR conditions.
+      if (ExtraCC != AArch64CC::AL) {
+        SDValue ExtraCmp;
+        if (!CCOp.getNode())
+          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+        else {
+          SDValue ConditionOp = DAG.getConstant(Condition, DL, MVT_CC);
+          // Note that we want the inverse of ExtraCC, so NZCV is not inverted.
+          unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
+          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
+                                               NZCV, DL, DAG);
+        }
+        CCOp = ExtraCmp;
+        Condition = AArch64CC::getInvertedCondCode(ExtraCC);
+        OutCC = AArch64CC::getInvertedCondCode(OutCC);
+      }
+    }
+
+    // Produce a normal comparison if we are first in the chain
+    if (!CCOp.getNode())
+      return emitComparison(LHS, RHS, CC, DL, DAG);
+    // Otherwise produce a ccmp.
+    SDValue ConditionOp = DAG.getConstant(Condition, DL, MVT_CC);
+    AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+    unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+    return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+                                     DAG);
+  }
+
+  // Construct comparison sequence for the left hand side.
+  SDValue LHS = Val->getOperand(0);
+  SDValue RHS = Val->getOperand(1);
+
+  // We can only implement AND-like behaviour here, but negation is free. So we
+  // use (not (and (not x) (not y))) to implement (or x y).
+  bool isOr = Val->getOpcode() == ISD::OR;
+  assert((isOr || Val->getOpcode() == ISD::AND) && "Should have AND or OR.");
+  Negate ^= isOr;
+
+  AArch64CC::CondCode RHSCC;
+  SDValue CmpR =
+      emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, isOr, CCOp, Condition);
+  SDValue CmpL =
+      emitConjunctionDisjunctionTree(DAG, LHS, OutCC, isOr, CmpR, RHSCC);
+  if (Negate)
+    OutCC = AArch64CC::getInvertedCondCode(OutCC);
+  return CmpL;
+}
+
 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
   SDValue Cmp;
@@ -1227,47 +1356,55 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       }
     }
   }
-  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
-  // For the i8 operand, the largest immediate is 255, so this can be easily
-  // encoded in the compare instruction. For the i16 operand, however, the
-  // largest immediate cannot be encoded in the compare.
-  // Therefore, use a sign extending load and cmn to avoid materializing the -1
-  // constant. For example,
-  // movz w1, #65535
-  // ldrh w0, [x0, #0]
-  // cmp w0, w1
-  // >
-  // ldrsh w0, [x0, #0]
-  // cmn w0, #1
-  // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
-  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
-  // both the LHS and RHS are truely zero extended and to make sure the
-  // transformation is profitable.
   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
-    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
-        isa<LoadSDNode>(LHS)) {
-      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
-          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
-          LHS.getNode()->hasNUsesOfValue(1, 0)) {
-        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
-        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
-          SDValue SExt =
-              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
-                          DAG.getValueType(MVT::i16));
-          Cmp = emitComparison(SExt,
-                               DAG.getConstant(ValueofRHS, dl,
-                                               RHS.getValueType()),
-                               CC, dl, DAG);
-          AArch64CC = changeIntCCToAArch64CC(CC);
-          AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
-          return Cmp;
-        }
+    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
+
+    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+    // For the i8 operand, the largest immediate is 255, so this can be easily
+    // encoded in the compare instruction. For the i16 operand, however, the
+    // largest immediate cannot be encoded in the compare.
+    // Therefore, use a sign extending load and cmn to avoid materializing the
+    // -1 constant. For example,
+    // movz w1, #65535
+    // ldrh w0, [x0, #0]
+    // cmp w0, w1
+    // >
+    // ldrsh w0, [x0, #0]
+    // cmn w0, #1
+    // Fundamentally, we rely on the property that (zext LHS) == (zext RHS)
+    // if and only if (sext LHS) == (sext RHS). The checks are in place to
+    // ensure both the LHS and RHS are truly zero extended and to make sure the
+    // transformation is profitable.
+    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
+        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+        LHS.getNode()->hasNUsesOfValue(1, 0)) {
+      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+        SDValue SExt =
+            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                        DAG.getValueType(MVT::i16));
+        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
+                                                   RHS.getValueType()),
+                             CC, dl, DAG);
+        AArch64CC = changeIntCCToAArch64CC(CC);
+        goto CreateCCNode;
       }
     }
+
+    if ((RHSC->isNullValue() || RHSC->isOne()) &&
+        isConjunctionDisjunctionTree(LHS, 0)) {
+      bool Negate = (CC == ISD::SETNE) ^ RHSC->isNullValue();
+      Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC, Negate);
+      goto CreateCCNode;
+    }
   }
+
   Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
   AArch64CC = changeIntCCToAArch64CC(CC);
-  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
+
+CreateCCNode:
+  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
   return Cmp;
 }
 
@@ -4065,7 +4202,8 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
                        .Default(0);
   if (Reg)
     return Reg;
-  report_fatal_error("Invalid register name global variable");
+  report_fatal_error(Twine("Invalid register name \""
+                              + StringRef(RegName)  + "\"."));
 }
 
 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -6741,7 +6879,8 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                  Type *Ty) const {
+                                                  Type *Ty,
+                                                  unsigned AS) const {
   // AArch64 has five basic addressing modes:
   //  reg
   //  reg + 9-bit signed offset
@@ -6792,7 +6931,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
 }
 
 int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
-                                                Type *Ty) const {
+                                                Type *Ty,
+                                                unsigned AS) const {
   // Scaling factors are not free at all.
   // Operands                     | Rt Latency
   // -------------------------------------------
@@ -6800,7 +6940,7 @@ int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
   // -------------------------------------------
   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
   // Rt, [Xn, Wm, <extend> #imm]  |
-  if (isLegalAddressingMode(AM, Ty))
+  if (isLegalAddressingMode(AM, Ty, AS))
     // Scale represents reg2 * scale, thus account for 1 if
     // it is not equal to 0 or 1.
     return AM.Scale != 0 && AM.Scale != 1;
@@ -9120,3 +9260,8 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
   return Ty->isArrayTy();
 }
+
+bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
+                                                            EVT) const {
+  return false;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 0d9b8b7c875e..db192c78169a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,11 @@ enum NodeType : unsigned {
   SBCS,
   ANDS,
 
+  // Conditional compares. Operands: left,right,falsecc,cc,flags
+  CCMP,
+  CCMN,
+  FCCMP,
+
   // Floating point comparison
   FCMP,
 
@@ -314,14 +319,16 @@ public:
 
   /// isLegalAddressingMode - Return true if the addressing mode represented
   /// by AM is legal for this target, for a load/store of the specified type.
-  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
 
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, it returns a negative value.
-  int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
+  int getScalingFactorCost(const AddrMode &AM, Type *Ty,
+                           unsigned AS) const override;
 
   /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
   /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
@@ -506,6 +513,8 @@ private:
   bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
                                                  CallingConv::ID CallConv,
                                                  bool isVarArg) const override;
+
+  bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
 };
 
 namespace AArch64 {
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 3b8b6681a084..1fe9c7f8cc5a 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -525,6 +525,13 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
   let ParserMatchClass = Imm0_31Operand;
 }
 
+// True if the 32-bit immediate is in the range [0,31]
+def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 32;
+}]> {
+  let ParserMatchClass = Imm0_31Operand;
+}
+
 // imm0_15 predicate - True if the immediate is in the range [0,15]
 def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
   return ((uint64_t)Imm) < 16;
@@ -542,7 +549,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
 // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
 def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint32_t)Imm) < 16;
-}]>;
+}]> {
+  let ParserMatchClass = Imm0_15Operand;
+}
 
 // An arithmetic shifter operand:
 //  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
@@ -2068,9 +2077,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
 //---
 
 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
-    : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
-         asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype,
+                            string mnemonic, SDNode OpNode>
+    : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond),
+         mnemonic, "\t$Rn, $imm, $nzcv, $cond", "",
+         [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv),
+                             (i32 imm:$cond), NZCV))]>,
       Sched<[WriteI, ReadI]> {
   let Uses = [NZCV];
   let Defs = [NZCV];
@@ -2090,19 +2102,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
   let Inst{3-0}   = nzcv;
 }
 
-multiclass CondSetFlagsImm<bit op, string asm> {
-  def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
-    let Inst{31} = 0;
-  }
-  def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
-    let Inst{31} = 1;
-  }
-}
-
 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
-    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
-         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic,
+                            SDNode OpNode>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+         mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "",
+         [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv),
+                             (i32 imm:$cond), NZCV))]>,
       Sched<[WriteI, ReadI, ReadI]> {
   let Uses = [NZCV];
   let Defs = [NZCV];
@@ -2122,11 +2128,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
   let Inst{3-0}   = nzcv;
 }
 
-multiclass CondSetFlagsReg<bit op, string asm> {
-  def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> {
+  // immediate operand variants
+  def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  // register operand variants
+  def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> {
     let Inst{31} = 0;
   }
-  def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+  def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> {
     let Inst{31} = 1;
   }
 }
@@ -3934,11 +3948,14 @@ multiclass FPComparison<bit signalAllNans, string asm,
 //---
 
 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseFPCondComparison<bit signalAllNans,
-                              RegisterClass regtype, string asm>
-    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
-         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype,
+                           string mnemonic, list<dag> pat>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond),
+         mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>,
       Sched<[WriteFCmp]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
   bits<5> Rn;
   bits<5> Rm;
   bits<4> nzcv;
@@ -3954,16 +3971,18 @@ class BaseFPCondComparison<bit signalAllNans,
   let Inst{3-0}   = nzcv;
 }
 
-multiclass FPCondComparison<bit signalAllNans, string asm> {
-  let Defs = [NZCV], Uses = [NZCV] in {
-  def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
+multiclass FPCondComparison<bit signalAllNans, string mnemonic,
+                            SDPatternOperator OpNode = null_frag> {
+  def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic,
+      [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv),
+                          (i32 imm:$cond), NZCV))]> {
     let Inst{22} = 0;
   }
-
-  def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
+  def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic,
+      [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv),
+                          (i32 imm:$cond), NZCV))]> {
     let Inst{22} = 1;
   }
-  } // Defs = [NZCV], Uses = [NZCV]
 }
 
 //---
@@ -8822,6 +8841,178 @@ class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
                [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
 } // end of 'let Predicates = [HasCrypto]'
 
+//----------------------------------------------------------------------------
+// v8.1 atomic instructions extension:
+// * CAS
+// * CASP
+// * SWP
+// * LDOPregister<OP>, and aliases STOPregister<OP>
+
+// Instruction encodings:
+//
+//      31 30|29  24|23|22|21|20 16|15|14  10|9 5|4 0
+// CAS  SZ   |001000|1 |A |1 |Rs   |R |11111 |Rn |Rt
+// CASP  0|SZ|001000|0 |A |1 |Rs   |R |11111 |Rn |Rt
+// SWP  SZ   |111000|A |R |1 |Rs   |1 |OPC|00|Rn |Rt
+// LD   SZ   |111000|A |R |1 |Rs   |0 |OPC|00|Rn |Rt
+// ST   SZ   |111000|A |R |1 |Rs   |0 |OPC|00|Rn |11111
+
+// Instruction syntax:
+//
+// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>]
+// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>]
+// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>]
+// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>]
+// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>]
+// ST<OP>{<order>} <Xs>, [<Xn|SP>]
+
+let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseCASEncoding<dag oops, dag iops, string asm, string operands,
+                      string cstr, list<dag> pattern>
+      : I<oops, iops, asm, operands, cstr, pattern> {
+  bits<2> Sz;
+  bit NP;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b001000;
+  let Inst{23} = NP;
+  let Inst{22} = Acq;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = Rel;
+  let Inst{14-10} = 0b11111;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+class BaseCAS<string order, string size, RegisterClass RC>
+      : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+                        "cas" # order # size, "\t$Rs, $Rt, [$Rn]",
+                        "$out = $Rs",[]> {
+  let NP = 1;
+}
+
+multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseCAS<order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseCAS<order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseCAS<order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseCAS<order, "", GPR64>;
+}
+
+class BaseCASP<string order, string size, RegisterOperand RC>
+      : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn),
+                        "casp" # order # size, "\t$Rs, $Rt, [$Rn]",
+                        "$out = $Rs",[]> {
+  let NP = 0;
+}
+
+multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in 
+    def s : BaseCASP<order, "", WSeqPairClassOperand>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in 
+    def d : BaseCASP<order, "", XSeqPairClassOperand>;
+}
+
+let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseSWP<string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]> {
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc = 0b000;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b1;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+multiclass Swap<bits<1> Acq, bits<1> Rel, string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseSWP<order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseSWP<order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseSWP<order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseSWP<order, "", GPR64>;
+}
+
+let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+class BaseLDOPregister<string op, string order, string size, RegisterClass RC>
+      : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size,
+          "\t$Rs, $Rt, [$Rn]","",[]> {
+  bits<2> Sz;
+  bit Acq;
+  bit Rel;
+  bits<5> Rs;
+  bits<3> opc;
+  bits<5> Rn;
+  bits<5> Rt;
+  let Inst{31-30} = Sz;
+  let Inst{29-24} = 0b111000;
+  let Inst{23} = Acq;
+  let Inst{22} = Rel;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rs;
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+}
+
+multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel, 
+                        string order> {
+  let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in 
+    def b : BaseLDOPregister<op, order, "b", GPR32>;
+  let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in 
+    def h : BaseLDOPregister<op, order, "h", GPR32>;
+  let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in 
+    def s : BaseLDOPregister<op, order, "", GPR32>;
+  let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in 
+    def d : BaseLDOPregister<op, order, "", GPR64>;
+}
+
+let Predicates = [HasV8_1a] in
+class BaseSTOPregister<string asm, RegisterClass OP, Register Reg,
+                        Instruction inst> :
+      InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>;
+
+multiclass STOPregister<string asm, string instr> {
+  def : BaseSTOPregister<asm # "lb", GPR32, WZR, 
+                    !cast<Instruction>(instr # "Lb")>;
+  def : BaseSTOPregister<asm # "lh", GPR32, WZR, 
+                    !cast<Instruction>(instr # "Lh")>;
+  def : BaseSTOPregister<asm # "l",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "Ls")>;
+  def : BaseSTOPregister<asm # "l",  GPR64, XZR, 
+                    !cast<Instruction>(instr # "Ld")>;
+  def : BaseSTOPregister<asm # "b",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "b")>;
+  def : BaseSTOPregister<asm # "h",  GPR32, WZR, 
+                    !cast<Instruction>(instr # "h")>;
+  def : BaseSTOPregister<asm,        GPR32, WZR, 
+                    !cast<Instruction>(instr # "s")>;
+  def : BaseSTOPregister<asm,        GPR64, XZR, 
+                    !cast<Instruction>(instr # "d")>;
+}
+
+//----------------------------------------------------------------------------
 // Allow the size specifier tokens to be upper case, not just lower.
 def : TokenAlias<".8B", ".8b">;
 def : TokenAlias<".4H", ".4h">;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 207c34ca7f0b..6941a6bf1b47 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2066,10 +2066,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
       .setMIFlag(Flag);
 }
 
-MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                      MachineInstr *MI,
-                                                      ArrayRef<unsigned> Ops,
-                                                      int FrameIndex) const {
+MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
   // This is a bit of a hack. Consider this instruction:
   //
   //   %vreg0<def> = COPY %SP; GPR64all:%vreg0
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index fa4b8b7e6179..d296768ab9b0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -131,6 +131,7 @@ public:
   using TargetInstrInfo::foldMemoryOperandImpl;
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       int FrameIndex) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index c7d6a69b9fd7..2f1b8933bf61 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -66,6 +66,20 @@ def SDT_AArch64CSel  : SDTypeProfile<1, 4,
                                     SDTCisSameAs<0, 2>,
                                     SDTCisInt<3>,
                                     SDTCisVT<4, i32>]>;
+def SDT_AArch64CCMP : SDTypeProfile<1, 5, // 1 result, 5 operands
+                                    [SDTCisVT<0, i32>, // result is i32
+                                     SDTCisInt<1>, // operand 1 is an integer
+                                     SDTCisSameAs<1, 2>, // 1 and 2 same type
+                                     SDTCisInt<3>, // operand 3 is an integer
+                                     SDTCisInt<4>, // operand 4 is an integer
+                                     SDTCisVT<5, i32>]>; // operand 5 is i32
+def SDT_AArch64FCCMP : SDTypeProfile<1, 5, // 1 result, 5 operands
+                                     [SDTCisVT<0, i32>, // result is i32
+                                      SDTCisFP<1>, // operand 1 is floating point
+                                      SDTCisSameAs<1, 2>, // 1 and 2 same type
+                                      SDTCisInt<3>, // operand 3 is an integer
+                                      SDTCisInt<4>, // operand 4 is an integer
+                                      SDTCisVT<5, i32>]>; // operand 5 is i32
 def SDT_AArch64FCmp   : SDTypeProfile<0, 2,
                                    [SDTCisFP<0>,
                                     SDTCisSameAs<0, 1>]>;
@@ -160,6 +174,10 @@ def AArch64and_flag  : SDNode<"AArch64ISD::ANDS",  SDTBinaryArithWithFlagsOut,
 def AArch64adc_flag  : SDNode<"AArch64ISD::ADCS",  SDTBinaryArithWithFlagsInOut>;
 def AArch64sbc_flag  : SDNode<"AArch64ISD::SBCS",  SDTBinaryArithWithFlagsInOut>;
 
+def AArch64ccmp      : SDNode<"AArch64ISD::CCMP",  SDT_AArch64CCMP>;
+def AArch64ccmn      : SDNode<"AArch64ISD::CCMN",  SDT_AArch64CCMP>;
+def AArch64fccmp     : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
+
 def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
 
 def AArch64fcmp      : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
@@ -727,6 +745,74 @@ def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
 def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
 def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
 
+// v8.1 atomic CAS
+defm CAS   : CompareAndSwap<0, 0, "">;
+defm CASA  : CompareAndSwap<1, 0, "a">;
+defm CASL  : CompareAndSwap<0, 1, "l">;
+defm CASAL : CompareAndSwap<1, 1, "al">;
+
+// v8.1 atomic CASP
+defm CASP   : CompareAndSwapPair<0, 0, "">;
+defm CASPA  : CompareAndSwapPair<1, 0, "a">;
+defm CASPL  : CompareAndSwapPair<0, 1, "l">;
+defm CASPAL : CompareAndSwapPair<1, 1, "al">;
+
+// v8.1 atomic SWP
+defm SWP   : Swap<0, 0, "">;
+defm SWPA  : Swap<1, 0, "a">;
+defm SWPL  : Swap<0, 1, "l">;
+defm SWPAL : Swap<1, 1, "al">;
+
+// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
+defm LDADD   : LDOPregister<0b000, "add", 0, 0, "">;
+defm LDADDA  : LDOPregister<0b000, "add", 1, 0, "a">;
+defm LDADDL  : LDOPregister<0b000, "add", 0, 1, "l">;
+defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">;
+
+defm LDCLR   : LDOPregister<0b001, "clr", 0, 0, "">;
+defm LDCLRA  : LDOPregister<0b001, "clr", 1, 0, "a">;
+defm LDCLRL  : LDOPregister<0b001, "clr", 0, 1, "l">;
+defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">;
+
+defm LDEOR   : LDOPregister<0b010, "eor", 0, 0, "">;
+defm LDEORA  : LDOPregister<0b010, "eor", 1, 0, "a">;
+defm LDEORL  : LDOPregister<0b010, "eor", 0, 1, "l">;
+defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">;
+
+defm LDSET   : LDOPregister<0b011, "set", 0, 0, "">;
+defm LDSETA  : LDOPregister<0b011, "set", 1, 0, "a">;
+defm LDSETL  : LDOPregister<0b011, "set", 0, 1, "l">;
+defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">;
+
+defm LDSMAX   : LDOPregister<0b100, "smax", 0, 0, "">;
+defm LDSMAXA  : LDOPregister<0b100, "smax", 1, 0, "a">;
+defm LDSMAXL  : LDOPregister<0b100, "smax", 0, 1, "l">;
+defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">;
+
+defm LDSMIN   : LDOPregister<0b101, "smin", 0, 0, "">;
+defm LDSMINA  : LDOPregister<0b101, "smin", 1, 0, "a">;
+defm LDSMINL  : LDOPregister<0b101, "smin", 0, 1, "l">;
+defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">;
+
+defm LDUMAX   : LDOPregister<0b110, "umax", 0, 0, "">;
+defm LDUMAXA  : LDOPregister<0b110, "umax", 1, 0, "a">;
+defm LDUMAXL  : LDOPregister<0b110, "umax", 0, 1, "l">;
+defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">;
+
+defm LDUMIN   : LDOPregister<0b111, "umin", 0, 0, "">;
+defm LDUMINA  : LDOPregister<0b111, "umin", 1, 0, "a">;
+defm LDUMINL  : LDOPregister<0b111, "umin", 0, 1, "l">;
+defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">;
+
+// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register) when Rt=xZR"
+defm : STOPregister<"stadd","LDADD">; // STADDx
+defm : STOPregister<"stclr","LDCLR">; // STCLRx
+defm : STOPregister<"steor","LDEOR">; // STEORx
+defm : STOPregister<"stset","LDSET">; // STSETx
+defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx
+defm : STOPregister<"stsmin","LDSMIN">;// STSMINx
+defm : STOPregister<"stumax","LDUMAX">;// STUMAXx
+defm : STOPregister<"stumin","LDUMIN">;// STUMINx
 
 //===----------------------------------------------------------------------===//
 // Logical instructions.
@@ -950,13 +1036,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
 def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
 
 //===----------------------------------------------------------------------===//
-// Conditionally set flags instructions.
+// Conditional comparison instructions.
 //===----------------------------------------------------------------------===//
-defm CCMN : CondSetFlagsImm<0, "ccmn">;
-defm CCMP : CondSetFlagsImm<1, "ccmp">;
-
-defm CCMN : CondSetFlagsReg<0, "ccmn">;
-defm CCMP : CondSetFlagsReg<1, "ccmp">;
+defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
+defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;
 
 //===----------------------------------------------------------------------===//
 // Conditional select instructions.
@@ -2486,7 +2569,7 @@ defm FCMP  : FPComparison<0, "fcmp", AArch64fcmp>;
 //===----------------------------------------------------------------------===//
 
 defm FCCMPE : FPCondComparison<1, "fccmpe">;
-defm FCCMP  : FPCondComparison<0, "fccmp">;
+defm FCCMP  : FPCondComparison<0, "fccmp", AArch64fccmp>;
 
 //===----------------------------------------------------------------------===//
 // Floating point conditional select instruction.
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 72edbf14c0d8..e55ae991b635 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -69,10 +69,10 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
              AArch64II::MO_PAGEOFF)
       RefKind = MCSymbolRefExpr::VK_PAGEOFF;
   }
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
   if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(
-        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
   return MCOperand::createExpr(Expr);
 }
 
@@ -139,14 +139,14 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
     RefFlags |= AArch64MCExpr::VK_NC;
 
   const MCExpr *Expr =
-      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx);
   if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(
-        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+    Expr = MCBinaryExpr::createAdd(
+        Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
 
   AArch64MCExpr::VariantKind RefKind;
   RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags);
-  Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx);
+  Expr = AArch64MCExpr::create(Expr, RefKind, Ctx);
 
   return MCOperand::createExpr(Expr);
 }
@@ -179,7 +179,7 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
     break;
   case MachineOperand::MO_MachineBasicBlock:
     MCOp = MCOperand::createExpr(
-        MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
+        MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
     break;
   case MachineOperand::MO_GlobalAddress:
     MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index d5ff3f1f3373..b2efca023372 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -26,8 +26,12 @@ let Namespace = "AArch64" in {
   def hsub : SubRegIndex<16>;
   def ssub : SubRegIndex<32>;
   def dsub : SubRegIndex<32>;
+  def sube32 : SubRegIndex<32>;
+  def subo32 : SubRegIndex<32>;
   def qhisub : SubRegIndex<64>;
   def qsub : SubRegIndex<64>;
+  def sube64 : SubRegIndex<64>;
+  def subo64 : SubRegIndex<64>;
   // Note: Code depends on these having consecutive numbers
   def dsub0 : SubRegIndex<64>;
   def dsub1 : SubRegIndex<64>;
@@ -592,3 +596,40 @@ def FPR16Op : RegisterOperand<FPR16, "printOperand">;
 def FPR32Op : RegisterOperand<FPR32, "printOperand">;
 def FPR64Op : RegisterOperand<FPR64, "printOperand">;
 def FPR128Op : RegisterOperand<FPR128, "printOperand">;
+
+
+//===----------------------------------------------------------------------===//
+// ARMv8.1a atomic CASP register operands
+
+
+// Two-register tuples: entry i of GPR32/GPR64 (sube*) is paired with the
+// entry one position later in the same class, via (rotl ..., 1) (subo*).
+def WSeqPairs : RegisterTuples<[sube32, subo32],
+                               [(rotl GPR32, 0), (rotl GPR32, 1)]>;
+def XSeqPairs : RegisterTuples<[sube64, subo64],
+                               [(rotl GPR64, 0), (rotl GPR64, 1)]>;
+
+def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, (add WSeqPairs)> {
+  let Size = 64;
+}
+def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, (add XSeqPairs)> {
+  let Size = 128;
+}
+
+// Asm operand classes and register operands for the W/X sequential pairs.
+let RenderMethod = "addRegOperands", ParserMethod = "tryParseGPRSeqPair" in {
+  def WSeqPairsAsmOperandClass : AsmOperandClass { let Name = "WSeqPair"; }
+  def XSeqPairsAsmOperandClass : AsmOperandClass { let Name = "XSeqPair"; }
+}
+
+def WSeqPairClassOperand
+    : RegisterOperand<WSeqPairsClass, "printGPRSeqPairsClassOperand<32>"> {
+  let ParserMatchClass = WSeqPairsAsmOperandClass;
+}
+def XSeqPairClassOperand
+    : RegisterOperand<XSeqPairsClass, "printGPRSeqPairsClassOperand<64>"> {
+  let ParserMatchClass = XSeqPairsAsmOperandClass;
+}
+
+
+//===----- END: v8.1a atomic CASP register operands -----------------------===//
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index a9059ab37f5f..f23dd33d0146 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -250,10 +250,14 @@ bool AArch64PassConfig::addPreISel() {
   // FIXME: On AArch64, this depends on the type.
   // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes().
   // and the offset has to be a multiple of the related size in bytes.
-  if ((TM->getOptLevel() == CodeGenOpt::Aggressive &&
+  if ((TM->getOptLevel() != CodeGenOpt::None &&
        EnableGlobalMerge == cl::BOU_UNSET) ||
-      EnableGlobalMerge == cl::BOU_TRUE)
-    addPass(createGlobalMergePass(TM, 4095));
+      EnableGlobalMerge == cl::BOU_TRUE) {
+    bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
+                               (EnableGlobalMerge == cl::BOU_UNSET);
+    addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize));
+  }
+
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createAArch64AddressTypePromotionPass());
 
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 299b4a55dd82..18ee4a9c72b5 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -40,11 +40,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
   if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
     const MCSymbol *Sym = TM.getSymbol(GV, Mang);
     const MCExpr *Res =
-        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
     MCSymbol *PCSym = getContext().createTempSymbol();
     Streamer.EmitLabel(PCSym);
-    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
-    return MCBinaryExpr::CreateSub(Res, PC, getContext());
+    const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
+    return MCBinaryExpr::createSub(Res, PC, getContext());
   }
 
   return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
@@ -65,9 +65,9 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
   // On ARM64 Darwin, we can reference symbols with foo@GOT-., which
   // is an indirect pc-relative reference.
   const MCExpr *Res =
-      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
   MCSymbol *PCSym = getContext().createTempSymbol();
   Streamer.EmitLabel(PCSym);
-  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
-  return MCBinaryExpr::CreateSub(Res, PC, getContext());
+  const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
+  return MCBinaryExpr::createSub(Res, PC, getContext());
 }
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 38d34e65a2e4..063c053ffe8a 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -107,6 +107,7 @@ private:
   OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
   OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
   bool tryParseVectorRegister(OperandVector &Operands);
+  OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
 
 public:
   enum AArch64MatchResultTy {
@@ -875,6 +876,16 @@ public:
     return Kind == k_Register && !Reg.isVector &&
       AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
   }
+  bool isWSeqPair() const { // Non-vector register in WSeqPairsClass?
+    const unsigned PairClassID = AArch64::WSeqPairsClassRegClassID;
+    return Kind == k_Register && !Reg.isVector &&
+           AArch64MCRegisterClasses[PairClassID].contains(Reg.RegNum);
+  }
+  bool isXSeqPair() const { // Non-vector register in XSeqPairsClass?
+    const unsigned PairClassID = AArch64::XSeqPairsClassRegClassID;
+    return Kind == k_Register && !Reg.isVector &&
+           AArch64MCRegisterClasses[PairClassID].contains(Reg.RegNum);
+  }
 
   bool isGPR64sp0() const {
     return Kind == k_Register && !Reg.isVector &&
@@ -1753,7 +1764,7 @@ static unsigned MatchRegisterName(StringRef Name);
 /// }
 
 static unsigned matchVectorRegName(StringRef Name) {
-  return StringSwitch<unsigned>(Name)
+  return StringSwitch<unsigned>(Name.lower())
       .Case("v0", AArch64::Q0)
       .Case("v1", AArch64::Q1)
       .Case("v2", AArch64::Q2)
@@ -2024,7 +2035,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
       // No modifier was specified at all; this is the syntax for an ELF basic
       // ADRP relocation (unfortunately).
       Expr =
-          AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+          AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
     } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
                 DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
                Addend != 0) {
@@ -2157,7 +2168,7 @@ AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
     if (MCE) {
       int64_t Val = MCE->getValue();
       if (Val > 0xfff && (Val & 0xfff) == 0) {
-        Imm = MCConstantExpr::Create(Val >> 12, getContext());
+        Imm = MCConstantExpr::create(Val >> 12, getContext());
         ShiftAmount = 12;
       }
     }
@@ -2347,14 +2358,14 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
 
 #define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
   do {                                                                         \
-    Expr = MCConstantExpr::Create(op1, getContext());                          \
+    Expr = MCConstantExpr::create(op1, getContext());                          \
     Operands.push_back(                                                        \
         AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
     Operands.push_back(                                                        \
         AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));           \
     Operands.push_back(                                                        \
         AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));           \
-    Expr = MCConstantExpr::Create(op2, getContext());                          \
+    Expr = MCConstantExpr::create(op2, getContext());                          \
     Operands.push_back(                                                        \
         AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
   } while (0)
@@ -2835,7 +2846,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
     return true;
 
   if (HasELFModifier)
-    ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext());
+    ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext());
 
   return false;
 }
@@ -3128,7 +3139,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
       if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
           Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
           Operands.push_back(AArch64Operand::CreateImm(
-                     MCConstantExpr::Create(Imm, Ctx), S, E, Ctx));
+                     MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
         if (ShiftAmt)
           Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL,
                      ShiftAmt, true, S, E, Ctx));
@@ -3634,8 +3645,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
           NewOp4Val = 63 - Op3Val;
         }
 
-        const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext());
-        const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
+        const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext());
+        const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext());
 
         Operands[0] = AArch64Operand::CreateToken(
             "ubfm", false, Op.getStartLoc(), getContext());
@@ -3685,8 +3696,8 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
           return Error(WidthOp.getStartLoc(),
                        "requested insert overflows register");
 
-        const MCExpr *ImmRExpr = MCConstantExpr::Create(ImmR, getContext());
-        const MCExpr *ImmSExpr = MCConstantExpr::Create(ImmS, getContext());
+        const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext());
+        const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext());
         Operands[0] = AArch64Operand::CreateToken(
               "bfm", false, Op.getStartLoc(), getContext());
         Operands[2] = AArch64Operand::CreateReg(
@@ -3742,9 +3753,9 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                          "requested insert overflows register");
 
           const MCExpr *NewOp3 =
-              MCConstantExpr::Create(NewOp3Val, getContext());
+              MCConstantExpr::create(NewOp3Val, getContext());
           const MCExpr *NewOp4 =
-              MCConstantExpr::Create(NewOp4Val, getContext());
+              MCConstantExpr::create(NewOp4Val, getContext());
           Operands[3] = AArch64Operand::CreateImm(
               NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
           Operands[4] = AArch64Operand::CreateImm(
@@ -3800,7 +3811,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                          "requested extract overflows register");
 
           const MCExpr *NewOp4 =
-              MCConstantExpr::Create(NewOp4Val, getContext());
+              MCConstantExpr::create(NewOp4Val, getContext());
           Operands[4] = AArch64Operand::CreateImm(
               NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
           if (Tok == "bfxil")
@@ -4021,7 +4032,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
   if (IDVal == ".ltorg" || IDVal == ".pool")
     return parseDirectiveLtorg(Loc);
   if (IDVal == ".unreq")
-    return parseDirectiveUnreq(DirectiveID.getLoc());
+    return parseDirectiveUnreq(Loc);
 
   if (!IsMachO && !IsCOFF) {
     if (IDVal == ".inst")
@@ -4106,8 +4117,8 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
     return Error(L, "expected symbol after directive");
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
-  Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
+  Expr = AArch64MCExpr::create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
 
   MCInst Inst;
   Inst.setOpcode(AArch64::TLSDESCCALL);
@@ -4354,3 +4365,77 @@ unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
     return Match_Success;
   return Match_InvalidOperand;
 }
+
+
+/// Parse an "<even reg>, <odd reg>" pair of consecutive same-size GPRs and
+/// push the matching sequential-pair register as a single operand.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {
+  const SMLoc StartLoc = getLoc();
+
+  // The operand has to open with an identifier token.
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    Error(StartLoc, "expected register");
+    return MatchOperand_ParseFail;
+  }
+
+  const int FirstReg = tryParseRegister();
+  if (FirstReg == -1)
+    return MatchOperand_ParseFail;
+
+  // Shared diagnostic for every way the first register can be unsuitable.
+  auto RejectFirst = [&]() -> OperandMatchResultTy {
+    Error(StartLoc, "expected first even register of a "
+                    "consecutive same-size even/odd register pair");
+    return MatchOperand_ParseFail;
+  };
+
+  const MCRegisterClass &GPR32Class =
+      AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
+  const MCRegisterClass &GPR64Class =
+      AArch64MCRegisterClasses[AArch64::GPR64RegClassID];
+  const bool FirstIsX = GPR64Class.contains(FirstReg);
+  const bool FirstIsW = GPR32Class.contains(FirstReg);
+  if (!FirstIsX && !FirstIsW)
+    return RejectFirst();
+
+  // A first register with an odd encoding is rejected as well.
+  const MCRegisterInfo *RegInfo = getContext().getRegisterInfo();
+  const unsigned FirstEncoding = RegInfo->getEncodingValue(FirstReg);
+  if (FirstEncoding & 0x1)
+    return RejectFirst();
+
+  // A comma must separate the two registers; consume it.
+  const SMLoc CommaLoc = getLoc();
+  if (getParser().getTok().isNot(AsmToken::Comma)) {
+    Error(CommaLoc, "expected comma");
+    return MatchOperand_ParseFail;
+  }
+  getParser().Lex();
+
+  const SMLoc SecondLoc = getLoc();
+  const int SecondReg = tryParseRegister();
+  if (SecondReg == -1)
+    return MatchOperand_ParseFail;
+
+  // The second register needs the next encoding and the same register size.
+  const bool SizeMatches =
+      (!FirstIsX || GPR64Class.contains(SecondReg)) &&
+      (!FirstIsW || GPR32Class.contains(SecondReg));
+  if (RegInfo->getEncodingValue(SecondReg) != FirstEncoding + 1 ||
+      !SizeMatches) {
+    Error(SecondLoc, "expected second odd register of a "
+                     "consecutive same-size even/odd register pair");
+    return MatchOperand_ParseFail;
+  }
+
+  // Look up the pair super-register whose even half is FirstReg.
+  const unsigned PairClassID = FirstIsX ? AArch64::XSeqPairsClassRegClassID
+                                        : AArch64::WSeqPairsClassRegClassID;
+  const unsigned EvenSubIdx = FirstIsX ? AArch64::sube64 : AArch64::sube32;
+  const unsigned Pair = RegInfo->getMatchingSuperReg(
+      FirstReg, EvenSubIdx, &AArch64MCRegisterClasses[PairClassID]);
+  Operands.push_back(
+      AArch64Operand::CreateReg(Pair, false, StartLoc, getLoc(), getContext()));
+  return MatchOperand_Success;
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index a1ed703d1bf4..359c2e734e21 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -169,6 +169,14 @@ static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
                                          uint64_t Addr, const void *Decoder);
 static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Addr,
+                                                      const void *Decoder);
+static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Addr,
+                                                      const void *Decoder);
 
 static bool Check(DecodeStatus &Out, DecodeStatus In) {
   switch (In) {
@@ -1543,3 +1551,35 @@ static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
 
   return Success;
 }
+
+/// Add the register at index RegNo of class RegClassID to Inst; odd RegNo fails.
+static DecodeStatus
+DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegClassID,
+                                    unsigned RegNo, uint64_t Addr,
+                                    const void *Decoder) {
+  // A pair has to start on an even register number (see CASP instruction).
+  if ((RegNo & 0x1) != 0)
+    return Fail;
+
+  const MCRegisterClass &PairClass = AArch64MCRegisterClasses[RegClassID];
+  Inst.addOperand(MCOperand::createReg(PairClass.getRegister(RegNo)));
+  return Success;
+}
+
+/// Forwards to the shared pair decoder using the WSeqPairsClass register class.
+static DecodeStatus
+DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+                                  const void *Decoder) {
+  const unsigned PairClassID = AArch64::WSeqPairsClassRegClassID;
+  return DecodeGPRSeqPairsClassRegisterClass(Inst, PairClassID, RegNo, Addr,
+                                             Decoder);
+}
+
+/// Forwards to the shared pair decoder using the XSeqPairsClass register class.
+static DecodeStatus
+DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr,
+                                  const void *Decoder) {
+  const unsigned PairClassID = AArch64::XSeqPairsClassRegClassID;
+  return DecodeGPRSeqPairsClassRegisterClass(Inst, PairClassID, RegNo, Addr,
+                                             Decoder);
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
index 07e4a45292fa..eb05ed915ddb 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -168,11 +168,11 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
       MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
       MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
       if (Variant != MCSymbolRefExpr::VK_None)
-        Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx);
+        Add = MCSymbolRefExpr::create(Sym, Variant, Ctx);
       else
-        Add = MCSymbolRefExpr::Create(Sym, Ctx);
+        Add = MCSymbolRefExpr::create(Sym, Ctx);
     } else {
-      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx);
+      Add = MCConstantExpr::create(SymbolicOp.AddSymbol.Value, Ctx);
     }
   }
 
@@ -181,37 +181,37 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
     if (SymbolicOp.SubtractSymbol.Name) {
       StringRef Name(SymbolicOp.SubtractSymbol.Name);
       MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
-      Sub = MCSymbolRefExpr::Create(Sym, Ctx);
+      Sub = MCSymbolRefExpr::create(Sym, Ctx);
     } else {
-      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx);
+      Sub = MCConstantExpr::create(SymbolicOp.SubtractSymbol.Value, Ctx);
     }
   }
 
   const MCExpr *Off = nullptr;
   if (SymbolicOp.Value != 0)
-    Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
+    Off = MCConstantExpr::create(SymbolicOp.Value, Ctx);
 
   const MCExpr *Expr;
   if (Sub) {
     const MCExpr *LHS;
     if (Add)
-      LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
+      LHS = MCBinaryExpr::createSub(Add, Sub, Ctx);
     else
-      LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
+      LHS = MCUnaryExpr::createMinus(Sub, Ctx);
     if (Off)
-      Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
+      Expr = MCBinaryExpr::createAdd(LHS, Off, Ctx);
     else
       Expr = LHS;
   } else if (Add) {
     if (Off)
-      Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
+      Expr = MCBinaryExpr::createAdd(Add, Off, Ctx);
     else
       Expr = Add;
   } else {
     if (Off)
       Expr = Off;
     else
-      Expr = MCConstantExpr::Create(0, Ctx);
+      Expr = MCConstantExpr::create(0, Ctx);
   }
 
   MI.addOperand(MCOperand::createExpr(Expr));
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index 62827e8f50eb..73665eb5701a 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = AArch64Disassembler
 parent = AArch64
-required_libraries = AArch64Info AArch64Utils MC MCDisassembler Support
+required_libraries = AArch64Desc AArch64Info AArch64Utils MC MCDisassembler Support
 add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 02bd929dc65d..96fbe3a9af4d 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -206,15 +206,15 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     else
       O << "\tmovn\t";
 
-    O << getRegisterName(MI->getOperand(0).getReg()) << ", #"
-      << *MI->getOperand(1).getExpr();
+    O << getRegisterName(MI->getOperand(0).getReg()) << ", #";
+    MI->getOperand(1).getExpr()->print(O, &MAI);
     return;
   }
 
   if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) &&
       MI->getOperand(2).isExpr()) {
-    O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
-      << *MI->getOperand(2).getExpr();
+    O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #";
+    MI->getOperand(2).getExpr()->print(O, &MAI);
     return;
   }
 
@@ -908,7 +908,7 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << '#' << Op.getImm();
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << *Op.getExpr();
+    Op.getExpr()->print(O, &MAI);
   }
 }
 
@@ -966,7 +966,7 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
       *CommentStream << '=' << (Val << Shift) << '\n';
   } else {
     assert(MO.isExpr() && "Unexpected operand type!");
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, &MAI);
     printShifter(MI, OpNum + 1, STI, O);
   }
 }
@@ -1091,7 +1091,7 @@ void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
     O << "#" << (MO.getImm() * Scale);
   } else {
     assert(MO.isExpr() && "Unexpected operand type!");
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, &MAI);
   }
 }
 
@@ -1103,7 +1103,8 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
       O << ", #" << (MO1.getImm() * Scale);
   } else {
     assert(MO1.isExpr() && "Unexpected operand type!");
-    O << ", " << *MO1.getExpr();
+    O << ", ";
+    MO1.getExpr()->print(O, &MAI);
   }
   O << ']';
 }
@@ -1113,7 +1114,7 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
                                          raw_ostream &O) {
   unsigned prfop = MI->getOperand(OpNum).getImm();
   bool Valid;
-  StringRef Name = 
+  StringRef Name =
       AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid);
   if (Valid)
     O << Name;
@@ -1177,6 +1178,23 @@ static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
   return Reg;
 }
 
+template<unsigned size>
+void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI,
+                                                   unsigned OpNum,
+                                                   const MCSubtargetInfo &STI,
+                                                   raw_ostream &O) {
+  static_assert(size == 64 || size == 32,
+                "Template parameter must be either 32 or 64");
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+
+  unsigned Sube = (size == 32) ? AArch64::sube32 : AArch64::sube64;
+  unsigned Subo = (size == 32) ? AArch64::subo32 : AArch64::subo64;
+
+  unsigned Even = MRI.getSubReg(Reg,  Sube);
+  unsigned Odd = MRI.getSubReg(Reg,  Subo);
+  O << getRegisterName(Even) << ", " << getRegisterName(Odd);
+}
+
 void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
                                          const MCSubtargetInfo &STI,
                                          raw_ostream &O,
@@ -1264,12 +1282,12 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
   const MCConstantExpr *BranchTarget =
       dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
   int64_t Address;
-  if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+  if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
     O << "0x";
     O.write_hex(Address);
   } else {
     // Otherwise, just print the expression.
-    O << *MI->getOperand(OpNum).getExpr();
+    MI->getOperand(OpNum).getExpr()->print(O, &MAI);
   }
 }
 
@@ -1286,7 +1304,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
   }
 
   // Otherwise, just print the expression.
-  O << *MI->getOperand(OpNum).getExpr();
+  MI->getOperand(OpNum).getExpr()->print(O, &MAI);
 }
 
 void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
@@ -1298,10 +1316,10 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
   bool Valid;
   StringRef Name;
   if (Opcode == AArch64::ISB)
-    Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(), 
+    Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(),
                                             Valid);
   else
-    Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(), 
+    Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(),
                                                 Valid);
   if (Valid)
     O << Name;
@@ -1337,7 +1355,7 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
   unsigned Val = MI->getOperand(OpNo).getImm();
 
   bool Valid;
-  StringRef Name = 
+  StringRef Name =
       AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid);
   if (Valid)
     O << StringRef(Name.str()).upper();
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index c2077a0fe557..15dee978e229 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -153,6 +153,10 @@ protected:
                               const MCSubtargetInfo &STI, raw_ostream &O);
   void printSIMDType10Operand(const MCInst *MI, unsigned OpNum,
                               const MCSubtargetInfo &STI, raw_ostream &O);
+  template<unsigned size>
+  void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum,
+                                    const MCSubtargetInfo &STI,
+                                    raw_ostream &O);
 };
 
 class AArch64AppleInstPrinter : public AArch64InstPrinter {
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 573fa10561cf..642c18394a67 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
 type = Library
 name = AArch64CodeGen
 parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
 add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 31fceb653a12..6c15bf3afb2d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -252,7 +252,7 @@ bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
   // We are properly aligned, so write NOPs as requested.
   Count /= 4;
   for (uint64_t i = 0; i != Count; ++i)
-    OW->Write32(0xd503201f);
+    OW->write32(0xd503201f);
   return true;
 }
 
@@ -496,7 +496,7 @@ void ELFAArch64AsmBackend::processFixupValue(
 // FIXME: Should be replaced with something more principled.
 static bool isByteSwappedFixup(const MCExpr *E) {
   MCValue Val;
-  if (!E->EvaluateAsRelocatable(Val, nullptr, nullptr))
+  if (!E->evaluateAsRelocatable(Val, nullptr, nullptr))
     return false;
 
   if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined())
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 204a1abe72b5..78837de18b97 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -23,16 +23,14 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
@@ -161,18 +159,18 @@ private:
     MCSymbol *Start = getContext().createTempSymbol();
     EmitLabel(Start);
 
-    MCSymbol *Symbol = getContext().getOrCreateSymbol(
-        Name + "." + Twine(MappingSymbolCounter++));
+    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++)));
 
-    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-    MCELF::SetType(SD, ELF::STT_NOTYPE);
-    MCELF::SetBinding(SD, ELF::STB_LOCAL);
-    SD.setExternal(false);
+    getAssembler().registerSymbol(*Symbol);
+    Symbol->setType(ELF::STT_NOTYPE);
+    Symbol->setBinding(ELF::STB_LOCAL);
+    Symbol->setExternal(false);
     auto Sec = getCurrentSection().first;
     assert(Sec && "need a section");
     Symbol->setSection(*Sec);
 
-    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+    const MCExpr *Value = MCSymbolRefExpr::create(Start, getContext());
     Symbol->setVariableValue(Value);
   }
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index ab2cad6547fa..921c4b94a729 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -62,15 +62,14 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   // version.
   MCContext &Context = Streamer.getContext();
   const MCExpr *Res =
-      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context);
   MCSymbol *PCSym = Context.createTempSymbol();
   Streamer.EmitLabel(PCSym);
-  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
-  return MCBinaryExpr::CreateSub(Res, PC, Context);
+  const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context);
+  return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
-AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
-  Triple T(TT);
+AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 9b88de7dabbc..253cd30f26ee 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -18,9 +18,10 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-class Target;
-class StringRef;
 class MCStreamer;
+class Target;
+class Triple;
+
 struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   explicit AArch64MCAsmInfoDarwin();
   const MCExpr *
@@ -29,7 +30,7 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
-  explicit AArch64MCAsmInfoELF(StringRef TT);
+  explicit AArch64MCAsmInfoELF(const Triple &T);
 };
 
 } // namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 277ea9fbace2..7d8e79bc63c8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -22,6 +22,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -174,16 +175,6 @@ public:
   unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
                    const MCSubtargetInfo &STI) const;
 
-  void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
-
-  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
-    // Output the constant in little endian byte order.
-    for (unsigned i = 0; i != Size; ++i) {
-      EmitByte(Val & 255, OS);
-      Val >>= 8;
-    }
-  }
-
   void encodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
                          const MCSubtargetInfo &STI) const override;
@@ -611,7 +602,7 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
   }
 
   uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
-  EmitConstant(Binary, 4, OS);
+  support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
   ++MCNumEmitted; // Keep track of the # of mi's emitted.
 }
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 74b81af2cb4d..28703419514a 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -15,9 +15,8 @@
 #include "AArch64MCExpr.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Object/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -26,7 +25,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aarch64symbolrefexpr"
 
-const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
+const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind,
                                        MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, Kind);
 }
@@ -76,24 +75,24 @@ StringRef AArch64MCExpr::getVariantKindName() const {
   }
 }
 
-void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
+void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   if (getKind() != VK_NONE)
     OS << getVariantKindName();
-  OS << *Expr;
+  Expr->print(OS, MAI);
 }
 
 void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const {
   Streamer.visitUsedExpr(*getSubExpr());
 }
 
-MCSection *AArch64MCExpr::FindAssociatedSection() const {
+MCSection *AArch64MCExpr::findAssociatedSection() const {
   llvm_unreachable("FIXME: what goes here?");
 }
 
-bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
                                             const MCAsmLayout *Layout,
 					    const MCFixup *Fixup) const {
-  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup))
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
     return false;
 
   Res =
@@ -121,8 +120,7 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
     // We're known to be under a TLS fixup, so any symbol should be
     // modified. There should be only one.
     const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
-    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
-    MCELF::SetType(SD, ELF::STT_TLS);
+    cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
     break;
   }
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 95d22775736c..1165314e4105 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -112,7 +112,7 @@ public:
   /// @name Construction
   /// @{
 
-  static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
+  static const AArch64MCExpr *create(const MCExpr *Expr, VariantKind Kind,
                                    MCContext &Ctx);
 
   /// @}
@@ -145,13 +145,13 @@ public:
   /// (e.g. ":got:", ":lo12:").
   StringRef getVariantKindName() const;
 
-  void PrintImpl(raw_ostream &OS) const override;
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 
   void visitUsedExpr(MCStreamer &Streamer) const override;
 
-  MCSection *FindAssociatedSection() const override;
+  MCSection *findAssociatedSection() const override;
 
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
 				 const MCFixup *Fixup) const override;
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 2e22de08537b..f89a85273872 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -58,15 +58,13 @@ static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
 }
 
 static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
-                                         StringRef TT) {
-  Triple TheTriple(TT);
-
+                                         const Triple &TheTriple) {
   MCAsmInfo *MAI;
   if (TheTriple.isOSDarwin())
     MAI = new AArch64MCAsmInfoDarwin();
   else {
     assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
-    MAI = new AArch64MCAsmInfoELF(TT);
+    MAI = new AArch64MCAsmInfoELF(TheTriple);
   }
 
   // Initial state of the frame pointer is SP.
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index d425975e7cb0..67af810bbbec 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -31,10 +31,9 @@ class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
 
 public:
   AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
-      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
-                                 /*UseAggressiveSymbolFolding=*/true) {}
+      : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype) {}
 
-  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override;
@@ -140,7 +139,7 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
   return false;
 }
 
-void AArch64MachObjectWriter::RecordRelocation(
+void AArch64MachObjectWriter::recordRelocation(
     MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
     const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
     uint64_t &FixedValue) {
@@ -209,11 +208,9 @@ void AArch64MachObjectWriter::RecordRelocation(
     }
   } else if (Target.getSymB()) { // A - B + constant
     const MCSymbol *A = &Target.getSymA()->getSymbol();
-    const MCSymbolData &A_SD = Asm.getSymbolData(*A);
     const MCSymbol *A_Base = Asm.getAtom(*A);
 
     const MCSymbol *B = &Target.getSymB()->getSymbol();
-    const MCSymbolData &B_SD = Asm.getSymbolData(*B);
     const MCSymbol *B_Base = Asm.getAtom(*B);
 
     // Check for "_foo@got - .", which comes through here as:
@@ -264,14 +261,12 @@ void AArch64MachObjectWriter::RecordRelocation(
       Asm.getContext().reportFatalError(Fixup.getLoc(),
                                   "unsupported relocation with identical base");
 
-    Value += (!A_SD.getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) -
-             (!A_Base || !A_Base->getData().getFragment()
-                  ? 0
-                  : Writer->getSymbolAddress(*A_Base, Layout));
-    Value -= (!B_SD.getFragment() ? 0 : Writer->getSymbolAddress(*B, Layout)) -
-             (!B_Base || !B_Base->getData().getFragment()
-                  ? 0
-                  : Writer->getSymbolAddress(*B_Base, Layout));
+    Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) -
+             (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress(
+                                                          *A_Base, Layout));
+    Value -= (!B->getFragment() ? 0 : Writer->getSymbolAddress(*B, Layout)) -
+             (!B_Base || !B_Base->getFragment() ? 0 : Writer->getSymbolAddress(
+                                                          *B_Base, Layout));
 
     Type = MachO::ARM64_RELOC_UNSIGNED;
 
@@ -304,7 +299,7 @@ void AArch64MachObjectWriter::RecordRelocation(
       // If the evaluation is an absolute value, just use that directly
       // to keep things easy.
       int64_t Res;
-      if (Symbol->getVariableValue()->EvaluateAsAbsolute(
+      if (Symbol->getVariableValue()->evaluateAsAbsolute(
               Res, Layout, Writer->getSectionAddressMap())) {
         FixedValue = Res;
         return;
@@ -313,12 +308,12 @@ void AArch64MachObjectWriter::RecordRelocation(
       // FIXME: Will the Target we already have ever have any data in it
       // we need to preserve and merge with the new Target? How about
       // the FixedValue?
-      if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout,
+      if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout,
                                                              &Fixup))
         Asm.getContext().reportFatalError(Fixup.getLoc(),
                                     "unable to resolve variable '" +
                                         Symbol->getName() + "'");
-      return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+      return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
                               FixedValue);
     }
 
@@ -360,7 +355,7 @@ void AArch64MachObjectWriter::RecordRelocation(
       // Resolve constant variables.
       if (Symbol->isVariable()) {
         int64_t Res;
-        if (Symbol->getVariableValue()->EvaluateAsAbsolute(
+        if (Symbol->getVariableValue()->evaluateAsAbsolute(
                 Res, Layout, Writer->getSectionAddressMap())) {
           FixedValue = Res;
           return;
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 28b8e7e29fe2..ee85b65bf39a 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -175,6 +175,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = {
   {"id_mmfr1_el1", ID_MMFR1_EL1, {}},
   {"id_mmfr2_el1", ID_MMFR2_EL1, {}},
   {"id_mmfr3_el1", ID_MMFR3_EL1, {}},
+  {"id_mmfr4_el1", ID_MMFR4_EL1, {}},
   {"id_isar0_el1", ID_ISAR0_EL1, {}},
   {"id_isar1_el1", ID_ISAR1_EL1, {}},
   {"id_isar2_el1", ID_ISAR2_EL1, {}},
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7125f14f1a2d..7e42f8e3601e 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -603,6 +603,7 @@ namespace AArch64SysReg {
     ISR_EL1           = 0xc608, // 11  000  1100  0001  000
     CNTPCT_EL0        = 0xdf01, // 11  011  1110  0000  001
     CNTVCT_EL0        = 0xdf02,  // 11  011  1110  0000  010
+    ID_MMFR4_EL1      = 0xc016,  // 11  000  0000  0010  110
 
     // Trace registers
     TRCSTATR          = 0x8818, // 10  001  0000  0011  000
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index d3cc068993e0..9550a3a3cad1 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -16,11 +16,13 @@
 #define LLVM_LIB_TARGET_ARM_ARM_H
 
 #include "llvm/Support/CodeGen.h"
+#include <functional>
 
 namespace llvm {
 
 class ARMAsmPrinter;
 class ARMBaseTargetMachine;
+class Function;
 class FunctionPass;
 class ImmutablePass;
 class MachineInstr;
@@ -38,7 +40,8 @@ FunctionPass *createARMConstantIslandPass();
 FunctionPass *createMLxExpansionPass();
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createARMOptimizeBarriersPass();
-FunctionPass *createThumb2SizeReductionPass();
+FunctionPass *createThumb2SizeReductionPass(
+    std::function<bool(const Function &)> Ftor = nullptr);
 
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 04503b89de73..d84f2961d810 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -87,7 +87,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
   const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
   assert(GV && "C++ constructor pointer was not a GlobalValue!");
 
-  const MCExpr *E = MCSymbolRefExpr::Create(GetARMGVSymbol(GV,
+  const MCExpr *E = MCSymbolRefExpr::create(GetARMGVSymbol(GV,
                                                            ARMII::MO_NO_FLAG),
                                             (Subtarget->isTargetELF()
                                              ? MCSymbolRefExpr::VK_ARM_TARGET1
@@ -173,7 +173,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     break;
   }
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     return;
   case MachineOperand::MO_GlobalAddress: {
     const GlobalValue *GV = MO.getGlobal();
@@ -181,7 +181,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
       O << ":lower16:";
     else if (TF & ARMII::MO_HI16)
       O << ":upper16:";
-    O << *GetARMGVSymbol(GV, TF);
+    GetARMGVSymbol(GV, TF)->print(O, MAI);
 
     printOffset(MO.getOffset(), O);
     if (TF == ARMII::MO_PLT)
@@ -189,7 +189,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     break;
   }
   case MachineOperand::MO_ConstantPoolIndex:
-    O << *GetCPISymbol(MO.getIndex());
+    GetCPISymbol(MO.getIndex())->print(O, MAI);
     break;
   }
 }
@@ -467,7 +467,7 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
     // using NLPs; however, sometimes the types are local to the file.
     // We need to fill in the value for the NLP in those cases.
     OutStreamer.EmitValue(
-        MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+        MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
         4 /*size*/);
 }
 
@@ -640,9 +640,13 @@ void ARMAsmPrinter::emitAttributes() {
     if (STI.hasFPARMv8())
       // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
       // FPU, but there are two different names for it depending on the CPU.
-      ATS.emitFPU(STI.hasD16() ? ARM::FK_FPV5_D16 : ARM::FK_FP_ARMV8);
+      ATS.emitFPU(STI.hasD16()
+                  ? (STI.isFPOnlySP() ? ARM::FK_FPV5_SP_D16 : ARM::FK_FPV5_D16)
+                  : ARM::FK_FP_ARMV8);
     else if (STI.hasVFP4())
-      ATS.emitFPU(STI.hasD16() ? ARM::FK_VFPV4_D16 : ARM::FK_VFPV4);
+      ATS.emitFPU(STI.hasD16()
+                  ? (STI.isFPOnlySP() ? ARM::FK_FPV4_SP_D16 : ARM::FK_VFPV4_D16)
+                  : ARM::FK_VFPV4);
     else if (STI.hasVFP3())
       ATS.emitFPU(STI.hasD16() ? ARM::FK_VFPV3_D16 : ARM::FK_VFPV3);
     else if (STI.hasVFP2())
@@ -895,7 +899,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
 
   // Create an MCSymbol for the reference.
   const MCExpr *Expr =
-    MCSymbolRefExpr::Create(MCSym, getModifierVariantKind(ACPV->getModifier()),
+    MCSymbolRefExpr::create(MCSym, getModifierVariantKind(ACPV->getModifier()),
                             OutContext);
 
   if (ACPV->getPCAdjustment()) {
@@ -903,10 +907,10 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
                                     getFunctionNumber(),
                                     ACPV->getLabelId(),
                                     OutContext);
-    const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, OutContext);
+    const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext);
     PCRelExpr =
-      MCBinaryExpr::CreateAdd(PCRelExpr,
-                              MCConstantExpr::Create(ACPV->getPCAdjustment(),
+      MCBinaryExpr::createAdd(PCRelExpr,
+                              MCConstantExpr::create(ACPV->getPCAdjustment(),
                                                      OutContext),
                               OutContext);
     if (ACPV->mustAddCurrentAddress()) {
@@ -914,25 +918,22 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
       // label, so just emit a local label end reference that instead.
       MCSymbol *DotSym = OutContext.createTempSymbol();
       OutStreamer->EmitLabel(DotSym);
-      const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
-      PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext);
+      const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+      PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext);
     }
-    Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext);
+    Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext);
   }
   OutStreamer->EmitValue(Expr, Size);
 }
 
-void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
-  unsigned Opcode = MI->getOpcode();
-  int OpNum = 1;
-  if (Opcode == ARM::BR_JTadd)
-    OpNum = 2;
-  else if (Opcode == ARM::BR_JTm)
-    OpNum = 3;
-
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
+void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
+  const MachineOperand &MO1 = MI->getOperand(1);
   unsigned JTI = MO1.getIndex();
 
+  // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
+  // ARM mode tables.
+  EmitAlignment(2);
+
   // Emit a label for the jump table.
   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
   OutStreamer->EmitLabel(JTISymbol);
@@ -955,16 +956,16 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
     // LJTI_0_0:
     //    .word (LBB0 - LJTI_0_0)
     //    .word (LBB1 - LJTI_0_0)
-    const MCExpr *Expr = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+    const MCExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
 
     if (TM.getRelocationModel() == Reloc::PIC_)
-      Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol,
+      Expr = MCBinaryExpr::createSub(Expr, MCSymbolRefExpr::create(JTISymbol,
                                                                    OutContext),
                                      OutContext);
     // If we're generating a table of Thumb addresses in static relocation
     // model, we need to add one to keep interworking correctly.
     else if (AFI->isThumbFunction())
-      Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(1,OutContext),
+      Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(1,OutContext),
                                      OutContext);
     OutStreamer->EmitValue(Expr, 4);
   }
@@ -972,10 +973,8 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
   OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
 }
 
-void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
-  unsigned Opcode = MI->getOpcode();
-  int OpNum = (Opcode == ARM::t2BR_JT) ? 2 : 1;
-  const MachineOperand &MO1 = MI->getOperand(OpNum);
+void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
+  const MachineOperand &MO1 = MI->getOperand(1);
   unsigned JTI = MO1.getIndex();
 
   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
@@ -985,51 +984,67 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
   const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
   const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
-  unsigned OffsetWidth = 4;
-  if (MI->getOpcode() == ARM::t2TBB_JT) {
-    OffsetWidth = 1;
-    // Mark the jump table as data-in-code.
-    OutStreamer->EmitDataRegion(MCDR_DataRegionJT8);
-  } else if (MI->getOpcode() == ARM::t2TBH_JT) {
-    OffsetWidth = 2;
-    // Mark the jump table as data-in-code.
-    OutStreamer->EmitDataRegion(MCDR_DataRegionJT16);
-  }
 
   for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
     MachineBasicBlock *MBB = JTBBs[i];
-    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(),
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(),
                                                           OutContext);
     // If this isn't a TBB or TBH, the entries are direct branch instructions.
-    if (OffsetWidth == 4) {
-      EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2B)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2B)
         .addExpr(MBBSymbolExpr)
         .addImm(ARMCC::AL)
         .addReg(0));
-      continue;
-    }
+  }
+}
+
+void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
+                                        unsigned OffsetWidth) {
+  assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width");
+  const MachineOperand &MO1 = MI->getOperand(1);
+  unsigned JTI = MO1.getIndex();
+
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
+  OutStreamer->EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  // Mark the jump table as data-in-code.
+  OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
+                                               : MCDR_DataRegionJT16);
+
+  for (auto MBB : JTBBs) {
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::create(MBB->getSymbol(),
+                                                          OutContext);
     // Otherwise it's an offset from the dispatch instruction. Construct an
     // MCExpr for the entry. We want a value of the form:
-    // (BasicBlockAddr - TableBeginAddr) / 2
+    // (BasicBlockAddr - (TBBInstAddr + 4)) / 2
     //
     // For example, a TBB table with entries jumping to basic blocks BB0 and BB1
     // would look like:
     // LJTI_0_0:
-    //    .byte (LBB0 - LJTI_0_0) / 2
-    //    .byte (LBB1 - LJTI_0_0) / 2
-    const MCExpr *Expr =
-      MCBinaryExpr::CreateSub(MBBSymbolExpr,
-                              MCSymbolRefExpr::Create(JTISymbol, OutContext),
-                              OutContext);
-    Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(2, OutContext),
+    //    .byte (LBB0 - (LCPI0_0 + 4)) / 2
+    //    .byte (LBB1 - (LCPI0_0 + 4)) / 2
+    // where LCPI0_0 is a label defined just before the TBB instruction using
+    // this table.
+    MCSymbol *TBInstPC = GetCPISymbol(MI->getOperand(0).getImm());
+    const MCExpr *Expr = MCBinaryExpr::createAdd(
+        MCSymbolRefExpr::create(TBInstPC, OutContext),
+        MCConstantExpr::create(4, OutContext), OutContext);
+    Expr = MCBinaryExpr::createSub(MBBSymbolExpr, Expr, OutContext);
+    Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext),
                                    OutContext);
     OutStreamer->EmitValue(Expr, OffsetWidth);
   }
   // Mark the end of jump table data-in-code region. 32-bit offsets use
   // actual branch instructions here, so we don't mark those as a data-region
   // at all.
-  if (OffsetWidth != 4)
-    OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+  OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+
+  // Make sure the next instruction is 2-byte aligned.
+  EmitAlignment(1);
 }
 
 void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
@@ -1212,7 +1227,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
                   : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
                      : ARM::ADR))
       .addReg(MI->getOperand(0).getReg())
-      .addExpr(MCSymbolRefExpr::Create(CPISymbol, OutContext))
+      .addExpr(MCSymbolRefExpr::create(CPISymbol, OutContext))
       // Add predicate operands.
       .addImm(MI->getOperand(2).getImm())
       .addReg(MI->getOperand(3).getReg()));
@@ -1228,7 +1243,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
                   : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
                      : ARM::ADR))
       .addReg(MI->getOperand(0).getReg())
-      .addExpr(MCSymbolRefExpr::Create(JTIPICSymbol, OutContext))
+      .addExpr(MCSymbolRefExpr::create(JTIPICSymbol, OutContext))
       // Add predicate operands.
       .addImm(MI->getOperand(2).getImm())
       .addReg(MI->getOperand(3).getReg()));
@@ -1278,7 +1293,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBL)
         // Predicate comes first here.
         .addImm(ARMCC::AL).addReg(0)
-        .addExpr(MCSymbolRefExpr::Create(TRegSym, OutContext)));
+        .addExpr(MCSymbolRefExpr::create(TRegSym, OutContext)));
     return;
   }
   case ARM::BMOVPCRX_CALL: {
@@ -1315,7 +1330,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GV = Op.getGlobal();
     const unsigned TF = Op.getTargetFlags();
     MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
-    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc)
       .addExpr(GVSymExpr)
       // Add predicate operands.
@@ -1332,17 +1347,17 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     unsigned TF = MI->getOperand(1).getTargetFlags();
     const GlobalValue *GV = MI->getOperand(1).getGlobal();
     MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
-    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
 
     MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
                                      getFunctionNumber(),
                                      MI->getOperand(2).getImm(), OutContext);
-    const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+    const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
     unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
     const MCExpr *PCRelExpr =
-      ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr,
-                                      MCBinaryExpr::CreateAdd(LabelSymExpr,
-                                      MCConstantExpr::Create(PCAdj, OutContext),
+      ARMMCExpr::createLower16(MCBinaryExpr::createSub(GVSymExpr,
+                                      MCBinaryExpr::createAdd(LabelSymExpr,
+                                      MCConstantExpr::create(PCAdj, OutContext),
                                       OutContext), OutContext), OutContext);
       TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
 
@@ -1365,17 +1380,17 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     unsigned TF = MI->getOperand(2).getTargetFlags();
     const GlobalValue *GV = MI->getOperand(2).getGlobal();
     MCSymbol *GVSym = GetARMGVSymbol(GV, TF);
-    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext);
 
     MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(),
                                      getFunctionNumber(),
                                      MI->getOperand(3).getImm(), OutContext);
-    const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+    const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
     unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
     const MCExpr *PCRelExpr =
-        ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr,
-                                   MCBinaryExpr::CreateAdd(LabelSymExpr,
-                                      MCConstantExpr::Create(PCAdj, OutContext),
+        ARMMCExpr::createUpper16(MCBinaryExpr::createSub(GVSymExpr,
+                                   MCBinaryExpr::createAdd(LabelSymExpr,
+                                      MCConstantExpr::create(PCAdj, OutContext),
                                           OutContext), OutContext), OutContext);
       TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
     // Add predicate operands.
@@ -1501,6 +1516,16 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       EmitGlobalConstant(MCPE.Val.ConstVal);
     return;
   }
+  case ARM::JUMPTABLE_ADDRS:
+    EmitJumpTableAddrs(MI);
+    return;
+  case ARM::JUMPTABLE_INSTS:
+    EmitJumpTableInsts(MI);
+    return;
+  case ARM::JUMPTABLE_TBB:
+  case ARM::JUMPTABLE_TBH:
+    EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
+    return;
   case ARM::t2BR_JT: {
     // Lower and emit the instruction itself, then the jump table following it.
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
@@ -1509,37 +1534,19 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       // Add predicate operands.
       .addImm(ARMCC::AL)
       .addReg(0));
-
-    // Output the data for the jump table itself
-    EmitJump2Table(MI);
-    return;
-  }
-  case ARM::t2TBB_JT: {
-    // Lower and emit the instruction itself, then the jump table following it.
-    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2TBB)
-      .addReg(ARM::PC)
-      .addReg(MI->getOperand(0).getReg())
-      // Add predicate operands.
-      .addImm(ARMCC::AL)
-      .addReg(0));
-
-    // Output the data for the jump table itself
-    EmitJump2Table(MI);
-    // Make sure the next instruction is 2-byte aligned.
-    EmitAlignment(1);
     return;
   }
+  case ARM::t2TBB_JT:
   case ARM::t2TBH_JT: {
-    // Lower and emit the instruction itself, then the jump table following it.
-    EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::t2TBH)
-      .addReg(ARM::PC)
-      .addReg(MI->getOperand(0).getReg())
-      // Add predicate operands.
-      .addImm(ARMCC::AL)
-      .addReg(0));
-
-    // Output the data for the jump table itself
-    EmitJump2Table(MI);
+    unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? ARM::t2TBB : ARM::t2TBH;
+    // Lower and emit the PC label, then the instruction itself.
+    OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
+                                     .addReg(MI->getOperand(0).getReg())
+                                     .addReg(MI->getOperand(1).getReg())
+                                     // Add predicate operands.
+                                     .addImm(ARMCC::AL)
+                                     .addReg(0));
     return;
   }
   case ARM::tBR_JTr:
@@ -1559,13 +1566,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     if (Opc == ARM::MOVr)
       TmpInst.addOperand(MCOperand::createReg(0));
     EmitToStreamer(*OutStreamer, TmpInst);
-
-    // Make sure the Thumb jump table is 4-byte aligned.
-    if (Opc == ARM::tMOVr)
-      EmitAlignment(2);
-
-    // Output the data for the jump table itself
-    EmitJumpTable(MI);
     return;
   }
   case ARM::BR_JTm: {
@@ -1589,9 +1589,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
     TmpInst.addOperand(MCOperand::createReg(0));
     EmitToStreamer(*OutStreamer, TmpInst);
-
-    // Output the data for the jump table itself
-    EmitJumpTable(MI);
     return;
   }
   case ARM::BR_JTadd: {
@@ -1606,9 +1603,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(0)
       // Add 's' bit operand (always reg0 for this)
       .addReg(0));
-
-    // Output the data for the jump table itself
-    EmitJumpTable(MI);
     return;
   }
   case ARM::SPACE:
@@ -1695,7 +1689,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addImm(ARMCC::AL)
       .addReg(0));
 
-    const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext);
+    const MCExpr *SymbolExpr = MCSymbolRefExpr::create(Label, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tB)
       .addExpr(SymbolExpr)
       .addImm(ARMCC::AL)
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 7bfb9447818e..a6bc3683c8b9 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -71,8 +71,9 @@ public:
   void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
                         const MCSubtargetInfo *EndInfo) const override;
 
-  void EmitJumpTable(const MachineInstr *MI);
-  void EmitJump2Table(const MachineInstr *MI);
+  void EmitJumpTableAddrs(const MachineInstr *MI);
+  void EmitJumpTableInsts(const MachineInstr *MI);
+  void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
   void EmitInstruction(const MachineInstr *MI) override;
   bool runOnMachineFunction(MachineFunction &F) override;
 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c5d6b258240a..9c4b4961fe8c 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -627,6 +627,10 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   case ARM::t2MOVi32imm:
     return 8;
   case ARM::CONSTPOOL_ENTRY:
+  case ARM::JUMPTABLE_INSTS:
+  case ARM::JUMPTABLE_ADDRS:
+  case ARM::JUMPTABLE_TBB:
+  case ARM::JUMPTABLE_TBH:
     // If this machine instr is a constant pool entry, its size is recorded as
     // operand #2.
     return MI->getOperand(2).getImm();
@@ -641,42 +645,6 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
   case ARM::t2Int_eh_sjlj_setjmp:
   case ARM::t2Int_eh_sjlj_setjmp_nofp:
     return 12;
-  case ARM::BR_JTr:
-  case ARM::BR_JTm:
-  case ARM::BR_JTadd:
-  case ARM::tBR_JTr:
-  case ARM::t2BR_JT:
-  case ARM::t2TBB_JT:
-  case ARM::t2TBH_JT: {
-    // These are jumptable branches, i.e. a branch followed by an inlined
-    // jumptable. The size is 4 + 4 * number of entries. For TBB, each
-    // entry is one byte; TBH two byte each.
-    unsigned EntrySize = (Opc == ARM::t2TBB_JT)
-      ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4);
-    unsigned NumOps = MCID.getNumOperands();
-    MachineOperand JTOP =
-      MI->getOperand(NumOps - (MI->isPredicable() ? 2 : 1));
-    unsigned JTI = JTOP.getIndex();
-    const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-    assert(MJTI != nullptr);
-    const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
-    assert(JTI < JT.size());
-    // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
-    // 4 aligned. The assembler / linker may add 2 byte padding just before
-    // the JT entries.  The size does not include this padding; the
-    // constant islands pass does separate bookkeeping for it.
-    // FIXME: If we know the size of the function is less than (1 << 16) *2
-    // bytes, we can use 16-bit entries instead. Then there won't be an
-    // alignment issue.
-    unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
-    unsigned NumEntries = JT[JTI].MBBs.size();
-    if (Opc == ARM::t2TBB_JT && (NumEntries & 1))
-      // Make sure the instruction that follows TBB is 2-byte aligned.
-      // FIXME: Constant island pass should insert an "ALIGN" instruction
-      // instead.
-      ++NumEntries;
-    return NumEntries * EntrySize + InstSize;
-  }
   case ARM::SPACE:
     return MI->getOperand(1).getImm();
   }
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 6fa5ad7d0522..f4ec8c67c977 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -180,9 +180,7 @@ namespace {
       MachineInstr *MI;
       MachineInstr *CPEMI;
       MachineBasicBlock *HighWaterMark;
-    private:
       unsigned MaxDisp;
-    public:
       bool NegOk;
       bool IsSoImm;
       bool KnownAlignment;
@@ -216,12 +214,24 @@ namespace {
     };
 
     /// CPEntries - Keep track of all of the constant pool entry machine
-    /// instructions. For each original constpool index (i.e. those that
-    /// existed upon entry to this pass), it keeps a vector of entries.
-    /// Original elements are cloned as we go along; the clones are
-    /// put in the vector of the original element, but have distinct CPIs.
+    /// instructions. For each original constpool index (i.e. those that existed
+    /// upon entry to this pass), it keeps a vector of entries.  Original
+    /// elements are cloned as we go along; the clones are put in the vector of
+    /// the original element, but have distinct CPIs.
+    ///
+    /// The first half of CPEntries contains generic constants, the second half
+    /// contains jump tables. Use getCombinedIndex on a generic CPEMI to look up
+    /// which vector it will be in here.
     std::vector<std::vector<CPEntry> > CPEntries;
 
+    /// Maps a JT index to the offset in CPEntries containing copies of that
+    /// table. The equivalent map for a CONSTPOOL_ENTRY is the identity.
+    DenseMap<int, int> JumpTableEntryIndices;
+
+    /// Maps a JT index to the index in CPUsers of the LEA that actually uses
+    /// the jump table index to calculate its base address.
+    DenseMap<int, int> JumpTableUserIndices;
+
     /// ImmBranch - One per immediate branch, keeping the machine instruction
     /// pointer, conditional or unconditional, the max displacement,
     /// and (if isCond is true) the corresponding unconditional branch
@@ -269,7 +279,8 @@ namespace {
     }
 
   private:
-    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    void doInitialConstPlacement(std::vector<MachineInstr *> &CPEMIs);
+    void doInitialJumpTablePlacement(std::vector<MachineInstr *> &CPEMIs);
     bool BBHasFallthrough(MachineBasicBlock *MBB);
     CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
     unsigned getCPELogAlign(const MachineInstr *CPEMI);
@@ -279,6 +290,7 @@ namespace {
     void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
     void adjustBBOffsetsAfter(MachineBasicBlock *BB);
     bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    unsigned getCombinedIndex(const MachineInstr *CPEMI);
     int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
     bool findAvailableWater(CPUser&U, unsigned UserOffset,
                             water_iterator &WaterIter);
@@ -301,8 +313,9 @@ namespace {
     bool optimizeThumb2Instructions();
     bool optimizeThumb2Branches();
     bool reorderThumb2JumpTables();
-    unsigned removeDeadDefinitions(MachineInstr *MI, unsigned BaseReg,
-                                   unsigned IdxReg);
+    bool preserveBaseRegister(MachineInstr *JumpMI, MachineInstr *LEAMI,
+                              unsigned &DeadSize, bool &CanDeleteLEA,
+                              bool &BaseRegKill);
     bool optimizeThumb2JumpTables();
     MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB,
                                                   MachineBasicBlock *JTBB);
@@ -413,7 +426,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   // we put them all at the end of the function.
   std::vector<MachineInstr*> CPEMIs;
   if (!MCP->isEmpty())
-    doInitialPlacement(CPEMIs);
+    doInitialConstPlacement(CPEMIs);
+
+  if (MF->getJumpTableInfo())
+    doInitialJumpTablePlacement(CPEMIs);
 
   /// The next UID to take is the first unused one.
   AFI->initPICLabelUId(CPEMIs.size());
@@ -478,7 +494,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
     for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
       const CPEntry & CPE = CPEntries[i][j];
-      AFI->recordCPEClone(i, CPE.CPI);
+      if (CPE.CPEMI && CPE.CPEMI->getOperand(1).isCPI())
+        AFI->recordCPEClone(i, CPE.CPI);
     }
   }
 
@@ -488,6 +505,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   WaterList.clear();
   CPUsers.clear();
   CPEntries.clear();
+  JumpTableEntryIndices.clear();
+  JumpTableUserIndices.clear();
   ImmBranches.clear();
   PushPopMIs.clear();
   T2JumpTables.clear();
@@ -495,10 +514,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   return MadeChange;
 }
 
-/// doInitialPlacement - Perform the initial placement of the constant pool
-/// entries.  To start with, we put them all at the end of the function.
+/// \brief Perform the initial placement of the regular constant pool entries.
+/// To start with, we put them all at the end of the function.
 void
-ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) {
   // Create the basic block to hold the CPE's.
   MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
   MF->push_back(BB);
@@ -556,6 +575,66 @@ ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
   DEBUG(BB->dump());
 }
 
+/// \brief Do initial placement of the jump tables. Because Thumb2's TBB and TBH
+/// instructions can be made more efficient if the jump table immediately
+/// follows the instruction, it's best to place them immediately next to their
+/// jumps to begin with. In almost all cases they'll never be moved from that
+/// position. Each new table instruction is appended to \p CPEMIs.
+void ARMConstantIslands::doInitialJumpTablePlacement(
+    std::vector<MachineInstr *> &CPEMIs) {
+  unsigned i = CPEntries.size();
+  auto MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+
+  MachineBasicBlock *LastCorrectlyNumberedBB = nullptr;
+  for (MachineBasicBlock &MBB : *MF) {
+    auto MI = MBB.getLastNonDebugInstr();
+    if (MI == MBB.end()) // Empty block: there is no instruction to inspect.
+      continue;
+    unsigned JTOpcode;
+    switch (MI->getOpcode()) {
+    default:
+      continue;
+    case ARM::BR_JTadd:
+    case ARM::BR_JTr:
+    case ARM::tBR_JTr:
+    case ARM::BR_JTm:
+      JTOpcode = ARM::JUMPTABLE_ADDRS;
+      break;
+    case ARM::t2BR_JT:
+      JTOpcode = ARM::JUMPTABLE_INSTS;
+      break;
+    case ARM::t2TBB_JT:
+      JTOpcode = ARM::JUMPTABLE_TBB;
+      break;
+    case ARM::t2TBH_JT:
+      JTOpcode = ARM::JUMPTABLE_TBH;
+      break;
+    }
+
+    unsigned NumOps = MI->getDesc().getNumOperands();
+    MachineOperand JTOp = MI->getOperand(NumOps - (MI->isPredicable() ? 2 : 1));
+    unsigned JTI = JTOp.getIndex();
+    unsigned Size = JT[JTI].MBBs.size() * sizeof(uint32_t);
+    MachineBasicBlock *JumpTableBB = MF->CreateMachineBasicBlock();
+    MF->insert(std::next(MachineFunction::iterator(MBB)), JumpTableBB);
+    MachineInstr *CPEMI = BuildMI(*JumpTableBB, JumpTableBB->begin(),
+                                  DebugLoc(), TII->get(JTOpcode))
+                              .addImm(i++)
+                              .addJumpTableIndex(JTI)
+                              .addImm(Size);
+    CPEMIs.push_back(CPEMI);
+    CPEntries.emplace_back(1, CPEntry(CPEMI, JTI));
+    JumpTableEntryIndices.insert(std::make_pair(JTI, CPEntries.size() - 1));
+    if (!LastCorrectlyNumberedBB)
+      LastCorrectlyNumberedBB = &MBB;
+  }
+
+  // If we did anything then we need to renumber the subsequent blocks.
+  if (LastCorrectlyNumberedBB)
+    MF->RenumberBlocks(LastCorrectlyNumberedBB);
+}
+
 /// BBHasFallthrough - Return true if the specified basic block can fallthrough
 /// into the block immediately after it.
 bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
@@ -595,9 +674,21 @@ ARMConstantIslands::CPEntry
 /// getCPELogAlign - Returns the required alignment of the constant pool entry
 /// represented by CPEMI.  Alignment is measured in log2(bytes) units.
 unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
-  assert(CPEMI && CPEMI->getOpcode() == ARM::CONSTPOOL_ENTRY);
+  switch (CPEMI->getOpcode()) {
+  case ARM::CONSTPOOL_ENTRY:
+    break;
+  case ARM::JUMPTABLE_TBB:
+    return 0;
+  case ARM::JUMPTABLE_TBH:
+  case ARM::JUMPTABLE_INSTS:
+    return 1;
+  case ARM::JUMPTABLE_ADDRS:
+    return 2;
+  default:
+    llvm_unreachable("unknown constpool entry kind");
+  }
 
-  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned CPI = getCombinedIndex(CPEMI);
   assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
   unsigned Align = MCP->getConstants()[CPI].getAlignment();
   assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
@@ -706,12 +797,14 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
       if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
         PushPopMIs.push_back(I);
 
-      if (Opc == ARM::CONSTPOOL_ENTRY)
+      if (Opc == ARM::CONSTPOOL_ENTRY || Opc == ARM::JUMPTABLE_ADDRS ||
+          Opc == ARM::JUMPTABLE_INSTS || Opc == ARM::JUMPTABLE_TBB ||
+          Opc == ARM::JUMPTABLE_TBH)
         continue;
 
       // Scan the instructions for constant pool operands.
       for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
-        if (I->getOperand(op).isCPI()) {
+        if (I->getOperand(op).isCPI() || I->getOperand(op).isJTI()) {
           // We found one.  The addressing mode tells us the max displacement
           // from the PC that this instruction permits.
 
@@ -727,6 +820,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
 
           // Taking the address of a CP entry.
           case ARM::LEApcrel:
+          case ARM::LEApcrelJT:
             // This takes a SoImm, which is 8 bit immediate rotated. We'll
             // pretend the maximum offset is 255 * 4. Since each instruction
             // 4 byte wide, this is always correct. We'll check for other
@@ -737,10 +831,12 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
             IsSoImm = true;
             break;
           case ARM::t2LEApcrel:
+          case ARM::t2LEApcrelJT:
             Bits = 12;
             NegOk = true;
             break;
           case ARM::tLEApcrel:
+          case ARM::tLEApcrelJT:
             Bits = 8;
             Scale = 4;
             break;
@@ -768,6 +864,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
 
           // Remember that this is a user of a CP entry.
           unsigned CPI = I->getOperand(op).getIndex();
+          if (I->getOperand(op).isJTI()) {
+            JumpTableUserIndices.insert(std::make_pair(CPI, CPUsers.size()));
+            CPI = JumpTableEntryIndices[CPI];
+          }
+
           MachineInstr *CPEMI = CPEMIs[CPI];
           unsigned MaxOffs = ((1 << Bits)-1) * Scale;
           CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
@@ -1101,6 +1202,13 @@ bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI,
   return false;
 }
 
+unsigned ARMConstantIslands::getCombinedIndex(const MachineInstr *CPEMI) {
+  // CPI operands index CPEntries directly; JT indices are remapped first.
+  const MachineOperand &IndexOp = CPEMI->getOperand(1);
+  const int RawIndex = IndexOp.getIndex();
+  return IndexOp.isCPI() ? RawIndex : JumpTableEntryIndices[RawIndex];
+}
+
 /// LookForCPEntryInRange - see if the currently referenced CPE is in range;
 /// if not, see if an in-range clone of the CPE is in range, and if so,
 /// change the data structures so the user references the clone.  Returns:
@@ -1120,7 +1228,7 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
   }
 
   // No.  Look for previously created clones of the CPE that are in range.
-  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned CPI = getCombinedIndex(CPEMI);
   std::vector<CPEntry> &CPEs = CPEntries[CPI];
   for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
     // We already tried this one
@@ -1365,7 +1473,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
   CPUser &U = CPUsers[CPUserIndex];
   MachineInstr *UserMI = U.MI;
   MachineInstr *CPEMI  = U.CPEMI;
-  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned CPI = getCombinedIndex(CPEMI);
   unsigned Size = CPEMI->getOperand(2).getImm();
   // Compute this only once, it's expensive.
   unsigned UserOffset = getUserOffset(U);
@@ -1429,17 +1537,17 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
   // Update internal data structures to account for the newly inserted MBB.
   updateForInsertedWaterBlock(NewIsland);
 
-  // Decrement the old entry, and remove it if refcount becomes 0.
-  decrementCPEReferenceCount(CPI, CPEMI);
-
   // Now that we have an island to add the CPE to, clone the original CPE and
   // add it to the island.
   U.HighWaterMark = NewIsland;
-  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
-                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), CPEMI->getDesc())
+                .addImm(ID).addOperand(CPEMI->getOperand(1)).addImm(Size);
   CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
   ++NumCPEs;
 
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
   // Mark the basic block as aligned as required by the const-pool entry.
   NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
 
@@ -1844,77 +1952,120 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
   return MadeChange;
 }
 
-/// If we've formed a TBB or TBH instruction, the base register is now
-/// redundant. In most cases, the instructions defining it will now be dead and
-/// can be tidied up. This function removes them if so, and returns the number
-/// of bytes saved.
-unsigned ARMConstantIslands::removeDeadDefinitions(MachineInstr *MI,
-                                                   unsigned BaseReg,
-                                                   unsigned IdxReg) {
-  unsigned BytesRemoved = 0;
-  MachineBasicBlock *MBB = MI->getParent();
+static bool isSimpleIndexCalc(MachineInstr &I, unsigned EntryReg,
+                              unsigned BaseReg) {
+  if (I.getOpcode() != ARM::t2ADDrs)
+    return false;
 
-  // Scan backwards to find the instruction that defines the base
-  // register. Due to post-RA scheduling, we can't count on it
-  // immediately preceding the branch instruction.
-  MachineBasicBlock::iterator PrevI = MI;
-  MachineBasicBlock::iterator B = MBB->begin();
-  while (PrevI != B && !PrevI->definesRegister(BaseReg))
-    --PrevI;
-
-  // If for some reason we didn't find it, we can't do anything, so
-  // just skip this one.
-  if (!PrevI->definesRegister(BaseReg) || PrevI->hasUnmodeledSideEffects() ||
-      PrevI->mayStore())
-    return BytesRemoved;
-
-  MachineInstr *AddrMI = PrevI;
-  unsigned NewBaseReg = BytesRemoved;
-
-  // Examine the instruction that calculates the jumptable entry address.  Make
-  // sure it only defines the base register and kills any uses other than the
-  // index register. We also need precisely one use to trace backwards to
-  // (hopefully) the LEA.
-  for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) {
-    const MachineOperand &MO = AddrMI->getOperand(k);
-    if (!MO.isReg() || !MO.getReg())
-      continue;
-    if (MO.isDef() && MO.getReg() != BaseReg)
-      return BytesRemoved;
+  if (I.getOperand(0).getReg() != EntryReg)
+    return false;
 
-    if (MO.isUse() && MO.getReg() != IdxReg) {
-      if (!MO.isKill() || (NewBaseReg != 0 && NewBaseReg != MO.getReg()))
-        return BytesRemoved;
-      NewBaseReg = MO.getReg();
+  if (I.getOperand(1).getReg() != BaseReg)
+    return false;
+
+  // FIXME: what about CC and IdxReg?
+  return true;
+}
+
+/// \brief While trying to form a TBB/TBH instruction, we may (if the table
+/// doesn't immediately follow the BR_JT) need access to the start of the
+/// jump-table. We know one instruction that produces such a register; this
+/// function works out whether that definition can be preserved to the BR_JT,
+/// possibly by removing an intervening addition (which is usually needed to
+/// calculate the actual entry to jump to).
+bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
+                                              MachineInstr *LEAMI,
+                                              unsigned &DeadSize,
+                                              bool &CanDeleteLEA,
+                                              bool &BaseRegKill) {
+  if (JumpMI->getParent() != LEAMI->getParent())
+    return false;
+
+  // Now we hope that we have at least these instructions in the basic block:
+  //     BaseReg = t2LEA ...
+  //     [...]
+  //     EntryReg = t2ADDrs BaseReg, ...
+  //     [...]
+  //     t2BR_JT EntryReg
+  //
+  // We have to be very conservative about what we recognise here though. The
+  // main perturbing factors to watch out for are:
+  //    + Spills at any point in the chain: not direct problems but we would
+  //      expect a blocking Def of the spilled register so in practice what we
+  //      can do is limited.
+  //    + EntryReg == BaseReg: this is the one situation we should allow a Def
+  //      of BaseReg, but only if the t2ADDrs can be removed.
+  //    + Some instruction other than t2ADDrs computing the entry. Not seen in
+  //      the wild, but we should be careful.
+  unsigned EntryReg = JumpMI->getOperand(0).getReg();
+  unsigned BaseReg = LEAMI->getOperand(0).getReg();
+
+  CanDeleteLEA = true;
+  BaseRegKill = false;
+  MachineInstr *RemovableAdd = nullptr;
+  MachineBasicBlock::iterator I(LEAMI);
+  for (++I; &*I != JumpMI; ++I) {
+    if (isSimpleIndexCalc(*I, EntryReg, BaseReg)) {
+      RemovableAdd = &*I;
+      break;
+    }
+
+    for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+      const MachineOperand &MO = I->getOperand(K);
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (MO.isDef() && MO.getReg() == BaseReg)
+        return false;
+      if (MO.isUse() && MO.getReg() == BaseReg) {
+        BaseRegKill = BaseRegKill || MO.isKill();
+        CanDeleteLEA = false;
+      }
+    }
+  }
+
+  if (!RemovableAdd)
+    return true;
+
+  // Check the add really is removable, and that nothing else in the block
+  // clobbers BaseReg.
+  for (++I; &*I != JumpMI; ++I) {
+    for (unsigned K = 0, E = I->getNumOperands(); K != E; ++K) {
+      const MachineOperand &MO = I->getOperand(K);
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (MO.isDef() && MO.getReg() == BaseReg)
+        return false;
+      if (MO.isUse() && MO.getReg() == EntryReg)
+        RemovableAdd = nullptr;
     }
   }
 
-  // Want to continue searching for AddrMI, but there are 2 problems: AddrMI is
-  // going away soon, and even decrementing once may be invalid.
-  if (PrevI != B)
-    PrevI = std::prev(PrevI);
-
-  DEBUG(dbgs() << "remove addr: " << *AddrMI);
-  BytesRemoved += TII->GetInstSizeInBytes(AddrMI);
-  AddrMI->eraseFromParent();
-
-  // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction
-  // that gave us the initial base register definition.
-  for (; PrevI != B && !PrevI->definesRegister(NewBaseReg); --PrevI)
-    ;
-
-  // The instruction should be a tLEApcrel or t2LEApcrelJT; we want
-  // to delete it as well.
-  MachineInstr *LeaMI = PrevI;
-  if ((LeaMI->getOpcode() != ARM::tLEApcrelJT &&
-       LeaMI->getOpcode() != ARM::t2LEApcrelJT) ||
-      LeaMI->getOperand(0).getReg() != NewBaseReg)
-    return BytesRemoved;
-
-  DEBUG(dbgs() << "remove lea: " << *LeaMI);
-  BytesRemoved += TII->GetInstSizeInBytes(LeaMI);
-  LeaMI->eraseFromParent();
-  return BytesRemoved;
+  if (RemovableAdd) {
+    RemovableAdd->eraseFromParent();
+    DeadSize += 4;
+  } else if (BaseReg == EntryReg) {
+    // The add wasn't removable, but clobbered the base for the TBB. So we can't
+    // preserve it.
+    return false;
+  }
+
+  // We reached the end of the block without seeing another definition of
+  // BaseReg (except possibly the t2ADDrs, which was removed). BaseReg can be
+  // used in the TBB/TBH if necessary.
+  return true;
+}
+
+/// \brief Returns whether CPEMI is the first instruction in the block
+/// immediately following JTMI (assumed to be a TBB or TBH terminator). If so,
+/// we can switch the first register to PC and usually remove the address
+/// calculation that preceded it.
+static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
+  MachineFunction::iterator MBB = JTMI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  ++MBB;
+
+  return MBB != MF->end() && MBB->begin() != MBB->end() &&
+         &*MBB->begin() == CPEMI;
 }
 
 /// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
@@ -1955,37 +2106,79 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
         break;
     }
 
-    if (ByteOk || HalfWordOk) {
-      MachineBasicBlock *MBB = MI->getParent();
-      unsigned BaseReg = MI->getOperand(0).getReg();
-      bool BaseRegKill = MI->getOperand(0).isKill();
-      if (!BaseRegKill)
-        continue;
-      unsigned IdxReg = MI->getOperand(1).getReg();
-      bool IdxRegKill = MI->getOperand(1).isKill();
+    if (!ByteOk && !HalfWordOk)
+      continue;
 
-      DEBUG(dbgs() << "Shrink JT: " << *MI);
-      unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
-      MachineBasicBlock::iterator MI_JT = MI;
-      MachineInstr *NewJTMI =
-        BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
-        .addReg(IdxReg, getKillRegState(IdxRegKill))
-        .addJumpTableIndex(JTI, JTOP.getTargetFlags());
-      DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
-      // FIXME: Insert an "ALIGN" instruction to ensure the next instruction
-      // is 2-byte aligned. For now, asm printer will fix it up.
-      unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
-      unsigned OrigSize = TII->GetInstSizeInBytes(MI);
-      unsigned DeadSize = removeDeadDefinitions(MI, BaseReg, IdxReg);
-      MI->eraseFromParent();
+    MachineBasicBlock *MBB = MI->getParent();
+    if (!MI->getOperand(0).isKill()) // FIXME: needed now?
+      continue;
+    unsigned IdxReg = MI->getOperand(1).getReg();
+    bool IdxRegKill = MI->getOperand(1).isKill();
 
-      int delta = OrigSize - NewSize + DeadSize;
-      BBInfo[MBB->getNumber()].Size -= delta;
-      adjustBBOffsetsAfter(MBB);
+    CPUser &User = CPUsers[JumpTableUserIndices[JTI]];
+    unsigned DeadSize = 0;
+    bool CanDeleteLEA = false;
+    bool BaseRegKill = false;
+    bool PreservedBaseReg =
+        preserveBaseRegister(MI, User.MI, DeadSize, CanDeleteLEA, BaseRegKill);
 
-      ++NumTBs;
-      MadeChange = true;
+    if (!jumpTableFollowsTB(MI, User.CPEMI) && !PreservedBaseReg)
+      continue;
+
+    DEBUG(dbgs() << "Shrink JT: " << *MI);
+    MachineInstr *CPEMI = User.CPEMI;
+    unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
+    MachineBasicBlock::iterator MI_JT = MI;
+    MachineInstr *NewJTMI =
+        BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
+            .addReg(User.MI->getOperand(0).getReg(),
+                    getKillRegState(BaseRegKill))
+            .addReg(IdxReg, getKillRegState(IdxRegKill))
+            .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+            .addImm(CPEMI->getOperand(0).getImm());
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
+
+    unsigned JTOpc = ByteOk ? ARM::JUMPTABLE_TBB : ARM::JUMPTABLE_TBH;
+    CPEMI->setDesc(TII->get(JTOpc));
+
+    if (jumpTableFollowsTB(MI, User.CPEMI)) {
+      NewJTMI->getOperand(0).setReg(ARM::PC);
+      NewJTMI->getOperand(0).setIsKill(false);
+
+      if (CanDeleteLEA)  {
+        User.MI->eraseFromParent();
+        DeadSize += 4;
+
+        // The LEA was eliminated, the TBB instruction becomes the only new user
+        // of the jump table.
+        User.MI = NewJTMI;
+        User.MaxDisp = 4;
+        User.NegOk = false;
+        User.IsSoImm = false;
+        User.KnownAlignment = false;
+      } else {
+        // The LEA couldn't be eliminated, so we must add another CPUser to
+        // record the TBB or TBH use.
+        int CPEntryIdx = JumpTableEntryIndices[JTI];
+        auto &CPEs = CPEntries[CPEntryIdx];
+        auto Entry = std::find_if(CPEs.begin(), CPEs.end(), [&](CPEntry &E) {
+          return E.CPEMI == User.CPEMI;
+        });
+        ++Entry->RefCount;
+        CPUsers.emplace_back(CPUser(NewJTMI, User.CPEMI, 4, false, false));
+      }
     }
+
+    unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+    unsigned OrigSize = TII->GetInstSizeInBytes(MI);
+    MI->eraseFromParent();
+
+    int Delta = OrigSize - NewSize + DeadSize;
+    BBInfo[MBB->getNumber()].Size -= Delta;
+    adjustBBOffsetsAfter(MBB);
+
+    ++NumTBs;
+    MadeChange = true;
   }
 
   return MadeChange;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 4405625e47cd..50afb192b331 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -15,6 +15,7 @@
 #include "ARMBaseInstrInfo.h"
 #include "ARMTargetMachine.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -251,6 +252,9 @@ private:
   // Select special operations if node forms integer ABS pattern
   SDNode *SelectABSOp(SDNode *N);
 
+  SDNode *SelectReadRegister(SDNode *N);
+  SDNode *SelectWriteRegister(SDNode *N);
+
   SDNode *SelectInlineAsm(SDNode *N);
 
   SDNode *SelectConcatVector(SDNode *N);
@@ -2457,6 +2461,18 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
   switch (N->getOpcode()) {
   default: break;
+  case ISD::WRITE_REGISTER: {
+    SDNode *ResNode = SelectWriteRegister(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
+  case ISD::READ_REGISTER: {
+    SDNode *ResNode = SelectReadRegister(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
   case ISD::INLINEASM: {
     SDNode *ResNode = SelectInlineAsm(N);
     if (ResNode)
@@ -3336,6 +3352,418 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }
 
+// Inspect a register string of the form
+// cp<coprocessor>:<opc1>:c<CRn>:c<CRm>:<opc2> (32bit) or
+// cp<coprocessor>:<opc1>:c<CRm> (64bit): split the string into its fields
+// and obtain the integer operands from them, adding these operands to the
+// provided vector.
+static void getIntOperandsFromRegisterString(StringRef RegString,
+                                             SelectionDAG *CurDAG, SDLoc DL,
+                                             std::vector<SDValue>& Ops) {
+  SmallVector<StringRef, 5> Fields;
+  RegString.split(Fields, ":");
+
+  if (Fields.size() > 1) {
+    bool AllIntFields = true;
+
+    for (StringRef Field : Fields) {
+      // Need to trim out leading 'cp' characters and get the integer field.
+      unsigned IntField;
+      AllIntFields &= !Field.trim("CPcp").getAsInteger(10, IntField);
+      Ops.push_back(CurDAG->getTargetConstant(IntField, DL, MVT::i32));
+    }
+
+    assert(AllIntFields &&
+            "Unexpected non-integer value in special register string.");
+  }
+}
+
+// Maps a Banked Register string to its mask value. The mask value returned is
+// for use in the MRSbanked / MSRbanked instruction nodes as the Banked Register
+// mask operand, which expresses which register is to be used, e.g. r8, and in
+// which mode it is to be used, e.g. usr. Returns -1 to signify that the string
+// was invalid.
+static inline int getBankedRegisterMask(StringRef RegString) {
+  return StringSwitch<int>(RegString.lower())
+          .Case("r8_usr", 0x00)
+          .Case("r9_usr", 0x01)
+          .Case("r10_usr", 0x02)
+          .Case("r11_usr", 0x03)
+          .Case("r12_usr", 0x04)
+          .Case("sp_usr", 0x05)
+          .Case("lr_usr", 0x06)
+          .Case("r8_fiq", 0x08)
+          .Case("r9_fiq", 0x09)
+          .Case("r10_fiq", 0x0a)
+          .Case("r11_fiq", 0x0b)
+          .Case("r12_fiq", 0x0c)
+          .Case("sp_fiq", 0x0d)
+          .Case("lr_fiq", 0x0e)
+          .Case("lr_irq", 0x10)
+          .Case("sp_irq", 0x11)
+          .Case("lr_svc", 0x12)
+          .Case("sp_svc", 0x13)
+          .Case("lr_abt", 0x14)
+          .Case("sp_abt", 0x15)
+          .Case("lr_und", 0x16)
+          .Case("sp_und", 0x17)
+          .Case("lr_mon", 0x1c)
+          .Case("sp_mon", 0x1d)
+          .Case("elr_hyp", 0x1e)
+          .Case("sp_hyp", 0x1f)
+          .Case("spsr_fiq", 0x2e)
+          .Case("spsr_irq", 0x30)
+          .Case("spsr_svc", 0x32)
+          .Case("spsr_abt", 0x34)
+          .Case("spsr_und", 0x36)
+          .Case("spsr_mon", 0x3c)
+          .Case("spsr_hyp", 0x3e)
+          .Default(-1);
+}
+
+// Maps a MClass special register string to its value for use in the
+// t2MRS_M / t2MSR_M instruction nodes as the SYSm value operand.
+// Returns -1 to signify that the string was invalid.
+static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
+  return StringSwitch<int>(RegString.lower())
+          .Case("apsr", 0x0)
+          .Case("iapsr", 0x1)
+          .Case("eapsr", 0x2)
+          .Case("xpsr", 0x3)
+          .Case("ipsr", 0x5)
+          .Case("epsr", 0x6)
+          .Case("iepsr", 0x7)
+          .Case("msp", 0x8)
+          .Case("psp", 0x9)
+          .Case("primask", 0x10)
+          .Case("basepri", 0x11)
+          .Case("basepri_max", 0x12)
+          .Case("faultmask", 0x13)
+          .Case("control", 0x14)
+          .Default(-1);
+}
+
+// The flags here are common to those allowed for apsr in the A class cores and
+// those allowed for the special registers in the M class cores. Returns a
+// value representing which flags were present, -1 if invalid.
+static inline int getMClassFlagsMask(StringRef Flags) {
+  if (Flags.empty())
+    return 0x3;
+
+  return StringSwitch<int>(Flags)
+          .Case("g", 0x1)
+          .Case("nzcvq", 0x2)
+          .Case("nzcvqg", 0x3)
+          .Default(-1);
+}
+
+static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
+                                 const ARMSubtarget *Subtarget) {
+  // Ensure that the register (without flags) was a valid M Class special
+  // register.
+  int SYSmvalue = getMClassRegisterSYSmValueMask(Reg);
+  if (SYSmvalue == -1)
+    return -1;
+
+  // basepri, basepri_max and faultmask are only valid for V7m.
+  if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13)
+    return -1;
+
+  // If it was a read then we won't be expecting flags and so at this point
+  // we can return the mask.
+  if (IsRead) {
+    assert (Flags.empty() && "Unexpected flags for reading M class register.");
+    return SYSmvalue;
+  }
+
+  // We know we are now handling a write so need to get the mask for the flags.
+  int Mask = getMClassFlagsMask(Flags);
+
+  // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values
+  // shouldn't have flags present.
+  if ((SYSmvalue < 0x4 && Mask == -1) || (SYSmvalue > 0x4 && !Flags.empty()))
+    return -1;
+
+  // The _g and _nzcvqg versions are only valid if the DSP extension is
+  // available.
+  if (!Subtarget->hasThumb2DSP() && (Mask & 0x2))
+    return -1;
+
+  // The register was valid so need to put the mask in the correct place
+  // (the flags need to be in bits 11-10) and combine with the SYSmvalue to
+  // construct the operand for the instruction node.
+  if (SYSmvalue < 0x4)
+    return SYSmvalue | Mask << 10;
+
+  return SYSmvalue;
+}
+
+static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
+  // The mask operand contains the special register (R Bit) in bit 4, whether
+  // the register is spsr (R bit is 1) or one of cpsr/apsr (R bit is 0), and
+  // bits 3-0 contain the fields to be accessed in the special register, set by
+  // the flags provided with the register.
+  int Mask = 0;
+  if (Reg == "apsr") {
+    // The flags permitted for apsr are the same flags that are allowed in
+    // M class registers. We get the flag value and then shift the flags into
+    // the correct place to combine with the mask.
+    Mask = getMClassFlagsMask(Flags);
+    if (Mask == -1)
+      return -1;
+    return Mask << 2;
+  }
+
+  if (Reg != "cpsr" && Reg != "spsr") {
+    return -1;
+  }
+
+  // This is the same as if the flags were "fc"
+  if (Flags.empty() || Flags == "all")
+    return Mask | 0x9;
+
+  // Inspect the supplied flags string and set the bits in the mask for
+  // the relevant and valid flags allowed for cpsr and spsr.
+  for (char Flag : Flags) {
+    int FlagVal;
+    switch (Flag) {
+      case 'c':
+        FlagVal = 0x1;
+        break;
+      case 'x':
+        FlagVal = 0x2;
+        break;
+      case 's':
+        FlagVal = 0x4;
+        break;
+      case 'f':
+        FlagVal = 0x8;
+        break;
+      default:
+        FlagVal = 0;
+    }
+
+    // This avoids allowing strings where the same flag bit appears twice.
+    if (!FlagVal || (Mask & FlagVal))
+      return -1;
+    Mask |= FlagVal;
+  }
+
+  // If the register is spsr then we need to set the R bit.
+  if (Reg == "spsr")
+    Mask |= 0x10;
+
+  return Mask;
+}
+
+// Lower the read_register intrinsic to ARM specific DAG nodes
+// using the supplied metadata string to select the instruction node to use
+// and the registers/masks to construct as operands for the node.
+SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
+  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+  bool IsThumb2 = Subtarget->isThumb2();
+  SDLoc DL(N);
+
+  std::vector<SDValue> Ops;
+  getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops);
+
+  if (!Ops.empty()) {
+    // If the special register string was constructed of fields (as defined
+    // in the ACLE) then need to lower to MRC node (32 bit) or
+    // MRRC node (64 bit); we can make the distinction based on the number of
+    // operands we have.
+    unsigned Opcode;
+    SmallVector<EVT, 3> ResTypes;
+    if (Ops.size() == 5){
+      Opcode = IsThumb2 ? ARM::t2MRC : ARM::MRC;
+      ResTypes.append({ MVT::i32, MVT::Other });
+    } else {
+      assert(Ops.size() == 3 &&
+              "Invalid number of fields in special register string.");
+      Opcode = IsThumb2 ? ARM::t2MRRC : ARM::MRRC;
+      ResTypes.append({ MVT::i32, MVT::i32, MVT::Other });
+    }
+
+    Ops.push_back(getAL(CurDAG, DL));
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+    Ops.push_back(N->getOperand(0));
+    return CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops);
+  }
+
+  std::string SpecialReg = RegString->getString().lower();
+
+  int BankedReg = getBankedRegisterMask(SpecialReg);
+  if (BankedReg != -1) {
+    Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
+                                  DL, MVT::i32, MVT::Other, Ops);
+  }
+
+  // The VFP registers are read by creating SelectionDAG nodes with opcodes
+  // corresponding to the register that is being read from. So we switch on the
+  // string to find which opcode we need to use.
+  unsigned Opcode = StringSwitch<unsigned>(SpecialReg)
+                    .Case("fpscr", ARM::VMRS)
+                    .Case("fpexc", ARM::VMRS_FPEXC)
+                    .Case("fpsid", ARM::VMRS_FPSID)
+                    .Case("mvfr0", ARM::VMRS_MVFR0)
+                    .Case("mvfr1", ARM::VMRS_MVFR1)
+                    .Case("mvfr2", ARM::VMRS_MVFR2)
+                    .Case("fpinst", ARM::VMRS_FPINST)
+                    .Case("fpinst2", ARM::VMRS_FPINST2)
+                    .Default(0);
+
+  // If an opcode was found then we can lower the read to a VFP instruction.
+  if (Opcode) {
+    if (!Subtarget->hasVFP2())
+      return nullptr;
+    if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
+      return nullptr;
+
+    Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops);
+  }
+
+  // If the target is M Class then need to validate that the register string
+  // is an acceptable value, so check that a mask can be constructed from the
+  // string.
+  if (Subtarget->isMClass()) {
+    int SYSmValue = getMClassRegisterMask(SpecialReg, "", true, Subtarget);
+    if (SYSmValue == -1)
+      return nullptr;
+
+    SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+                      getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+                      N->getOperand(0) };
+    return CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops);
+  }
+
+  // Here we know the target is not M Class so we need to check if it is one
+  // of the remaining possible values which are apsr, cpsr or spsr.
+  if (SpecialReg == "apsr" || SpecialReg == "cpsr") {
+    Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS, DL,
+                                  MVT::i32, MVT::Other, Ops);
+  }
+
+  if (SpecialReg == "spsr") {
+    Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys,
+                                  DL, MVT::i32, MVT::Other, Ops);
+  }
+
+  return nullptr;
+}
+
+// Lower the write_register intrinsic to ARM specific DAG nodes
+// using the supplied metadata string to select the instruction node to use
+// and the registers/masks to use in the nodes
+SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
+  const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
+  const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+  bool IsThumb2 = Subtarget->isThumb2();
+  SDLoc DL(N);
+
+  std::vector<SDValue> Ops;
+  getIntOperandsFromRegisterString(RegString->getString(), CurDAG, DL, Ops);
+
+  if (!Ops.empty()) {
+    // If the special register string was constructed of fields (as defined
+    // in the ACLE) then need to lower to MCR node (32 bit) or
+    // MCRR node (64 bit); we can make the distinction based on the number of
+    // operands we have.
+    unsigned Opcode;
+    if (Ops.size() == 5) {
+      Opcode = IsThumb2 ? ARM::t2MCR : ARM::MCR;
+      Ops.insert(Ops.begin()+2, N->getOperand(2));
+    } else {
+      assert(Ops.size() == 3 &&
+              "Invalid number of fields in special register string.");
+      Opcode = IsThumb2 ? ARM::t2MCRR : ARM::MCRR;
+      SDValue WriteValue[] = { N->getOperand(2), N->getOperand(3) };
+      Ops.insert(Ops.begin()+2, WriteValue, WriteValue+2);
+    }
+
+    Ops.push_back(getAL(CurDAG, DL));
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+    Ops.push_back(N->getOperand(0));
+
+    return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+  }
+
+  std::string SpecialReg = RegString->getString().lower();
+  int BankedReg = getBankedRegisterMask(SpecialReg);
+  if (BankedReg != -1) {
+    Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
+                                  DL, MVT::Other, Ops);
+  }
+
+  // The VFP registers are written to by creating SelectionDAG nodes with
+  // opcodes corresponding to the register that is being written. So we switch
+  // on the string to find which opcode we need to use.
+  unsigned Opcode = StringSwitch<unsigned>(SpecialReg)
+                    .Case("fpscr", ARM::VMSR)
+                    .Case("fpexc", ARM::VMSR_FPEXC)
+                    .Case("fpsid", ARM::VMSR_FPSID)
+                    .Case("fpinst", ARM::VMSR_FPINST)
+                    .Case("fpinst2", ARM::VMSR_FPINST2)
+                    .Default(0);
+
+  if (Opcode) {
+    if (!Subtarget->hasVFP2())
+      return nullptr;
+    Ops = { N->getOperand(2), getAL(CurDAG, DL),
+            CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+    return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+  }
+
+  SmallVector<StringRef, 5> Fields;
+  StringRef(SpecialReg).split(Fields, "_", 1, false);
+  std::string Reg = Fields[0].str();
+  StringRef Flags = Fields.size() == 2 ? Fields[1] : "";
+
+  // If the target was M Class then need to validate the special register value
+  // and retrieve the mask for use in the instruction node.
+  if (Subtarget->isMClass()) {
+    // basepri_max gets split so need to correct Reg and Flags.
+    if (SpecialReg == "basepri_max") {
+      Reg = SpecialReg;
+      Flags = "";
+    }
+    int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
+    if (SYSmValue == -1)
+      return nullptr;
+
+    SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
+                      N->getOperand(2), getAL(CurDAG, DL),
+                      CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
+    return CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops);
+  }
+
+  // We then check to see if a valid mask can be constructed for one of the
+  // register string values permitted for the A and R class cores. These values
+  // are apsr, spsr and cpsr; these are also valid on older cores.
+  int Mask = getARClassRegisterMask(Reg, Flags);
+  if (Mask != -1) {
+    Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
+            getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
+            N->getOperand(0) };
+    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+                                  DL, MVT::Other, Ops);
+  }
+
+  return nullptr;
+}
+
 SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
   std::vector<SDValue> AsmNodeOperands;
   unsigned Flag, Kind;
@@ -3492,13 +3920,29 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
 bool ARMDAGToDAGISel::
 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                              std::vector<SDValue> &OutOps) {
-  assert(ConstraintID == InlineAsm::Constraint_m &&
-         "unexpected asm memory constraint");
-  // Require the address to be in a register.  That is safe for all ARM
-  // variants and it is hard to do anything much smarter without knowing
-  // how the operand is used.
-  OutOps.push_back(Op);
-  return false;
+  switch(ConstraintID) {
+  default:
+    llvm_unreachable("Unexpected asm memory constraint");
+  case InlineAsm::Constraint_i:
+    // FIXME: It seems strange that 'i' is needed here since it's supposed to
+    //        be an immediate and not a memory constraint.
+    // Fallthrough.
+  case InlineAsm::Constraint_m:
+  case InlineAsm::Constraint_Q:
+  case InlineAsm::Constraint_Um:
+  case InlineAsm::Constraint_Un:
+  case InlineAsm::Constraint_Uq:
+  case InlineAsm::Constraint_Us:
+  case InlineAsm::Constraint_Ut:
+  case InlineAsm::Constraint_Uv:
+  case InlineAsm::Constraint_Uy:
+    // Require the address to be in a register.  That is safe for all ARM
+    // variants and it is hard to do anything much smarter without knowing
+    // how the operand is used.
+    OutOps.push_back(Op);
+    return false;
+  }
+  return true;
 }
 
 /// createARMISelDag - This pass converts a legalized DAG into a
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 629cc90d67de..47c8400a668f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -426,6 +426,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 
+  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
+  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
+
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
     addDRTypeForNEON(MVT::v8i8);
@@ -2378,6 +2381,24 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   return !Subtarget->isThumb1Only();
 }
 
+// Trying to write a 64 bit value so need to split into two 32 bit values first,
+// and pass the low and high parts through.
+static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue WriteValue = Op->getOperand(2);
+
+  // This function is only supposed to be called for i64 type argument.
+  assert(WriteValue.getValueType() == MVT::i64
+          && "LowerWRITE_REGISTER called for non-i64 type argument.");
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                           DAG.getConstant(0, DL, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
+                           DAG.getConstant(1, DL, MVT::i32));
+  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
+  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
+}
+
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
 // one of the above mentioned nodes. It has to be wrapped because otherwise
@@ -4085,7 +4106,28 @@ unsigned ARMTargetLowering::getRegisterByName(const char* RegName,
                        .Default(0);
   if (Reg)
     return Reg;
-  report_fatal_error("Invalid register name global variable");
+  report_fatal_error(Twine("Invalid register name \""
+                              + StringRef(RegName)  + "\"."));
+}
+
+// Expand a READ_REGISTER with an i64 result into a READ_REGISTER producing two
+// i32 values, then rebuild the i64 result and forward the new node's chain.
+static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                SelectionDAG &DAG) {
+  SDLoc DL(N);
+
+  // This function is only supposed to be called for i64 type destination.
+  assert(N->getValueType(0) == MVT::i64
+          && "ExpandREAD_REGISTER called for non-i64 type result.");
+
+  SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
+                             DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
+                             N->getOperand(0),
+                             N->getOperand(1));
+
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
+                    Read.getValue(1)));
+  Results.push_back(Read.getValue(2)); // Output chain, not the input chain.
 }
 
 /// ExpandBITCAST - If the target supports VFP, this function is called to
@@ -6355,6 +6397,7 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
   case ISD::GlobalAddress:
@@ -6439,6 +6482,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::READ_REGISTER:
+    ExpandREAD_REGISTER(N, Results, DAG);
+    break;
   case ISD::BITCAST:
     Res = ExpandBITCAST(N, DAG);
     break;
@@ -10222,7 +10268,8 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              Type *Ty) const {
+                                              Type *Ty,
+                                              unsigned AS) const {
   EVT VT = getValueType(Ty, true);
   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
     return false;
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 63e87c5282d1..c0b329c5a1e5 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -286,7 +286,8 @@ namespace llvm {
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                               unsigned AS) const override;
     bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
 
     /// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -346,8 +347,31 @@ namespace llvm {
 
     unsigned getInlineAsmMemConstraint(
         const std::string &ConstraintCode) const override {
-      // FIXME: Map different constraints differently.
-      return InlineAsm::Constraint_m;
+      // "Q" and the two-letter "U?" codes listed below are mapped here; every
+      // other code is delegated to the generic TargetLowering implementation.
+      if (ConstraintCode == "Q")
+        return InlineAsm::Constraint_Q;
+      if (ConstraintCode.size() == 2 && ConstraintCode[0] == 'U') {
+        switch (ConstraintCode[1]) {
+        case 'm':
+          return InlineAsm::Constraint_Um;
+        case 'n':
+          return InlineAsm::Constraint_Un;
+        case 'q':
+          return InlineAsm::Constraint_Uq;
+        case 's':
+          return InlineAsm::Constraint_Us;
+        case 't':
+          return InlineAsm::Constraint_Ut;
+        case 'v':
+          return InlineAsm::Constraint_Uv;
+        case 'y':
+          return InlineAsm::Constraint_Uy;
+        default:
+          break; // Unrecognised "U?" code: use the generic handling below.
+        }
+      }
+      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
     }
 
     const ARMSubtarget* getSubtarget() const {
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 778fd17137f6..b8cac135baf6 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -1826,6 +1826,32 @@ def CONSTPOOL_ENTRY :
 PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
                     i32imm:$size), NoItinerary, []>;
 
+/// A jumptable consisting of direct 32-bit addresses of the destination basic
+/// blocks (either absolute, or relative to the start of the jump-table in PIC
+/// mode). Used mostly in ARM and Thumb-1 modes.
+def JUMPTABLE_ADDRS :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 32-bit jump instructions. Used for Thumb-2 tables
+/// that cannot be optimised to use TBB or TBH.
+def JUMPTABLE_INSTS :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 8-bit unsigned integers representing offsets from
+/// a TBB instruction.
+def JUMPTABLE_TBB :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+/// A jumptable consisting of 16-bit unsigned integers representing offsets from
+/// a TBH instruction.
+def JUMPTABLE_TBH :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                        i32imm:$size), NoItinerary, []>;
+
+
 // FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
 // from removing one half of the matched pairs. That breaks PEI, which assumes
 // these will always be in pairs, and asserts if it finds otherwise. Better way?
@@ -2224,7 +2250,7 @@ let isBranch = 1, isTerminator = 1 in {
                 [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>,
                 Sched<[WriteBr]>;
 
-    let isNotDuplicable = 1, isIndirectBranch = 1 in {
+    let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
     def BR_JTr : ARMPseudoInst<(outs),
                       (ins GPR:$target, i32imm:$jt),
                       0, IIC_Br,
@@ -5039,10 +5065,11 @@ def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
                               imm:$CRm, imm:$opc2),
                 (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
 
-class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
-  : ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
-        GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm),
-        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+class MovRRCopro<string opc, bit direction, dag oops, dag iops, list<dag>
+                 pattern = []>
+  : ABI<0b1100, oops, iops, NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
+        pattern> {
+
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
 
@@ -5060,9 +5087,13 @@ class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
 }
 
 def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                      (outs), (ins p_imm:$cop, imm0_15:$opc1, GPRnopc:$Rt,
+                      GPRnopc:$Rt2, c_imm:$CRm),
                       [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
                                      GPRnopc:$Rt2, imm:$CRm)]>;
-def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
+def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */,
+                      (outs GPRnopc:$Rt, GPRnopc:$Rt2),
+                      (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm), []>;
 
 class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
   : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 0fecfa1319d3..40414da3ca81 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -526,6 +526,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
                       0, IIC_Br,
                       [(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
                       Sched<[WriteBrTbl]> {
+    let Size = 2;
     list<Predicate> Predicates = [IsThumb, IsThumb1Only];
   }
 }
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 814b524b2bcb..aba8a7b10fd9 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -3531,20 +3531,20 @@ def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
   let AsmMatchConverter = "cvtThumbBranches";
 }
 
-let isNotDuplicable = 1, isIndirectBranch = 1 in {
+let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
 def t2BR_JT : t2PseudoInst<(outs),
           (ins GPR:$target, GPR:$index, i32imm:$jt),
            0, IIC_Br,
           [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt)]>,
           Sched<[WriteBr]>;
 
-// FIXME: Add a non-pc based case that can be predicated.
+// FIXME: Add a case that can be predicated.
 def t2TBB_JT : t2PseudoInst<(outs),
-        (ins GPR:$index, i32imm:$jt), 0, IIC_Br, []>,
+        (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
         Sched<[WriteBr]>;
 
 def t2TBH_JT : t2PseudoInst<(outs),
-        (ins GPR:$index, i32imm:$jt), 0, IIC_Br, []>,
+        (ins GPR:$base, GPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
         Sched<[WriteBr]>;
 
 def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
@@ -4141,11 +4141,9 @@ class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
   let Inst{19-16} = CRn;
 }
 
-class t2MovRRCopro<bits<4> Op, string opc, bit direction,
+class t2MovRRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
                    list<dag> pattern = []>
-  : T2Cop<Op, (outs),
-          (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+  : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -4210,19 +4208,25 @@ def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
 
 
 /* from ARM core register to coprocessor */
-def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0,
+def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0, (outs),
+                         (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
+                         c_imm:$CRm),
                         [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
                                        imm:$CRm)]>;
-def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0,
-                           [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
-                                           GPR:$Rt2, imm:$CRm)]> {
+def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0, (outs),
+                          (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2,
+                           c_imm:$CRm),
+                          [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+                                          GPR:$Rt2, imm:$CRm)]> {
   let Predicates = [IsThumb2, PreV8];
 }
 
 /* from coprocessor to ARM core register */
-def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>;
+def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1, (outs GPR:$Rt, GPR:$Rt2),
+                          (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)>;
 
-def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1> {
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1, (outs GPR:$Rt, GPR:$Rt2),
+                           (ins p_imm:$cop, imm0_15:$opc1, c_imm:$CRm)> {
   let Predicates = [IsThumb2, PreV8];
 }
 
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 5b62a21706ce..46ff326ba630 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains a pass that performs load / store related peephole
-// optimizations. This pass should be run after register allocation.
+/// \file This file contains a pass that performs load / store related peephole
+/// optimizations. This pass should be run after register allocation.
 //
 //===----------------------------------------------------------------------===//
 
@@ -58,10 +58,9 @@ STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
 STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
 STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
 
-/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine
-/// load / store instructions to form ldm / stm instructions.
-
 namespace {
+  /// Post- register allocation pass that combines load / store instructions to
+  /// form ldm / stm instructions.
   struct ARMLoadStoreOpt : public MachineFunctionPass {
     static char ID;
     ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -271,10 +270,7 @@ static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
   }
 }
 
-namespace llvm {
-  namespace ARM_AM {
-
-AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
+static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
   switch (Opcode) {
   default: llvm_unreachable("Unhandled opcode!");
   case ARM::LDMIA_RET:
@@ -328,9 +324,6 @@ AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
   }
 }
 
-  } // end namespace ARM_AM
-} // end namespace llvm
-
 static bool isT1i32Load(unsigned Opc) {
   return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
 }
@@ -469,9 +462,9 @@ ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
   }
 }
 
-/// MergeOps - Create and insert a LDM or STM with Base as base register and
-/// registers in Regs as the register operands that would be loaded / stored.
-/// It returns true if the transformation is done.
+/// Create and insert a LDM or STM with Base as base register and registers in
+/// Regs as the register operands that would be loaded / stored.  It returns
+/// true if the transformation is done.
 bool
 ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
@@ -665,7 +658,7 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   return true;
 }
 
-/// \brief Find all instructions using a given imp-def within a range.
+/// Find all instructions using a given imp-def within a range.
 ///
 /// We are trying to combine a range of instructions, one of which (located at
 /// position RangeBegin) implicitly defines a register. The final LDM/STM will
@@ -721,8 +714,7 @@ void ARMLoadStoreOpt::findUsesOfImpDef(
   }
 }
 
-// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
-// success.
+/// Call MergeOps and update MemOps and merges accordingly on success.
 void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                      MemOpQueue &memOps,
                                      unsigned memOpsBegin, unsigned memOpsEnd,
@@ -762,10 +754,10 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
     Regs.push_back(std::make_pair(Reg, isKill));
 
     // Collect any implicit defs of super-registers. They must be preserved.
-    for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) {
-      if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead())
+    for (const MachineOperand &MO : memOps[i].MBBI->operands()) {
+      if (!MO.isReg() || !MO.isDef() || !MO.isImplicit() || MO.isDead())
         continue;
-      unsigned DefReg = MO->getReg();
+      unsigned DefReg = MO.getReg();
       if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
         ImpDefs.push_back(DefReg);
 
@@ -823,8 +815,8 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
   }
 }
 
-/// MergeLDR_STR - Merge a number of load / store instructions into one or more
-/// load / store multiple instructions.
+/// Merge a number of load / store instructions into one or more load / store
+/// multiple instructions.
 void
 ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, unsigned Opcode, unsigned Size,
@@ -1083,8 +1075,8 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
   }
 }
 
-/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
-/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+/// Fold preceding/trailing inc/dec of base register into the
+/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
 ///
 /// stmia rn, <ra, rb, rc>
 /// rn := rn + 4 * 3;
@@ -1118,7 +1110,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
       return false;
 
   bool DoMerge = false;
-  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);
+  ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
 
   // Try merging with the previous instruction.
   MachineBasicBlock::iterator BeginMBBI = MBB.begin();
@@ -1231,8 +1223,8 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
   }
 }
 
-/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
-/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
+/// Fold preceding/trailing inc/dec of base register into the
+/// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
 bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator MBBI,
                                                const TargetInstrInfo *TII,
@@ -1373,8 +1365,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   return true;
 }
 
-/// isMemoryOp - Returns true if instruction is a memory operation that this
-/// pass is capable of operating on.
+/// Returns true if instruction is a memory operation that this pass is capable
+/// of operating on.
 static bool isMemoryOp(const MachineInstr *MI) {
   // When no memory operands are present, conservatively assume unaligned,
   // volatile, unfoldable.
@@ -1428,8 +1420,8 @@ static bool isMemoryOp(const MachineInstr *MI) {
   return false;
 }
 
-/// AdvanceRS - Advance register scavenger to just before the earliest memory
-/// op that is being merged.
+/// Advance register scavenger to just before the earliest memory op that is
+/// being merged.
 void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
   MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
   unsigned Position = MemOps[0].Position;
@@ -1472,8 +1464,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator &MBBI) {
   MachineInstr *MI = &*MBBI;
   unsigned Opcode = MI->getOpcode();
-  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
-      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
+  if (Opcode == ARM::LDRD || Opcode == ARM::STRD) {
     const MachineOperand &BaseOp = MI->getOperand(2);
     unsigned BaseReg = BaseOp.getReg();
     unsigned EvenReg = MI->getOperand(0).getReg();
@@ -1588,8 +1579,8 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
   return false;
 }
 
-/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
-/// ops of the same base and incrementing offset into LDM / STM ops.
+/// An optimization pass to turn multiple LDR / STR ops of the same base and
+/// incrementing offset into LDM / STM ops.
 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
   unsigned NumMerges = 0;
   unsigned NumMemOps = 0;
@@ -1770,9 +1761,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
   return NumMerges > 0;
 }
 
-/// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops
-/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
-/// directly restore the value of LR into pc.
+/// If this is an exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
+/// into the preceding stack restore so it directly restores the value of LR
+/// into pc.
 ///   ldmfd sp!, {..., lr}
 ///   bx lr
 /// or
@@ -1834,12 +1825,9 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   return Modified;
 }
 
-
-/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move
-/// load / stores from consecutive locations close to make it more
-/// likely they will be combined later.
-
 namespace {
+  /// Pre- register allocation pass that moves loads / stores from consecutive
+  /// locations close together so they are more likely to be combined later.
   struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
     static char ID;
     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -1936,7 +1924,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
 }
 
 
-/// Copy Op0 and Op1 operands into a new array assigned to MI.
+/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI.
 static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                    MachineInstr *Op1) {
   assert(MI->memoperands_empty() && "expected a new machineinstr");
@@ -1954,10 +1942,11 @@ static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
 
 bool
 ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
-                                          DebugLoc &dl,
-                                          unsigned &NewOpc, unsigned &EvenReg,
-                                          unsigned &OddReg, unsigned &BaseReg,
-                                          int &Offset, unsigned &PredReg,
+                                          DebugLoc &dl, unsigned &NewOpc,
+                                          unsigned &FirstReg,
+                                          unsigned &SecondReg,
+                                          unsigned &BaseReg, int &Offset,
+                                          unsigned &PredReg,
                                           ARMCC::CondCodes &Pred,
                                           bool &isT2) {
   // Make sure we're allowed to generate LDRD/STRD.
@@ -2016,9 +2005,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
       return false;
     Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
   }
-  EvenReg = Op0->getOperand(0).getReg();
-  OddReg  = Op1->getOperand(0).getReg();
-  if (EvenReg == OddReg)
+  FirstReg = Op0->getOperand(0).getReg();
+  SecondReg = Op1->getOperand(0).getReg();
+  if (FirstReg == SecondReg)
     return false;
   BaseReg = Op0->getOperand(1).getReg();
   Pred = getInstrPredicate(Op0, PredReg);
@@ -2114,7 +2103,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
         // to try to allocate a pair of registers that can form register pairs.
         MachineInstr *Op0 = Ops.back();
         MachineInstr *Op1 = Ops[Ops.size()-2];
-        unsigned EvenReg = 0, OddReg = 0;
+        unsigned FirstReg = 0, SecondReg = 0;
         unsigned BaseReg = 0, PredReg = 0;
         ARMCC::CondCodes Pred = ARMCC::AL;
         bool isT2 = false;
@@ -2122,21 +2111,21 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
         int Offset = 0;
         DebugLoc dl;
         if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
-                                             EvenReg, OddReg, BaseReg,
+                                             FirstReg, SecondReg, BaseReg,
                                              Offset, PredReg, Pred, isT2)) {
           Ops.pop_back();
           Ops.pop_back();
 
           const MCInstrDesc &MCID = TII->get(NewOpc);
           const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
-          MRI->constrainRegClass(EvenReg, TRC);
-          MRI->constrainRegClass(OddReg, TRC);
+          MRI->constrainRegClass(FirstReg, TRC);
+          MRI->constrainRegClass(SecondReg, TRC);
 
           // Form the pair instruction.
           if (isLd) {
             MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
-              .addReg(EvenReg, RegState::Define)
-              .addReg(OddReg, RegState::Define)
+              .addReg(FirstReg, RegState::Define)
+              .addReg(SecondReg, RegState::Define)
               .addReg(BaseReg);
             // FIXME: We're converting from LDRi12 to an insn that still
             // uses addrmode2, so we need an explicit offset reg. It should
@@ -2149,8 +2138,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
             ++NumLDRDFormed;
           } else {
             MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
-              .addReg(EvenReg)
-              .addReg(OddReg)
+              .addReg(FirstReg)
+              .addReg(SecondReg)
               .addReg(BaseReg);
             // FIXME: We're converting from LDRi12 to an insn that still
             // uses addrmode2, so we need an explicit offset reg. It should
@@ -2165,9 +2154,11 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
           MBB->erase(Op0);
           MBB->erase(Op1);
 
-          // Add register allocation hints to form register pairs.
-          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
-          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
+          if (!isT2) {
+            // Add register allocation hints to form register pairs.
+            MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
+            MRI->setRegAllocationHint(SecondReg,  ARMRI::RegPairOdd, FirstReg);
+          }
         } else {
           for (unsigned i = 0; i != NumMove; ++i) {
             MachineInstr *Op = Ops.back();
@@ -2292,8 +2283,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
 }
 
 
-/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
-/// optimization pass.
+/// Returns an instance of the load / store optimization pass.
 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
   if (PreAlloc)
     return new ARMPreAllocLoadStoreOpt();
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index e370b962ba7f..a2aca2d1a69e 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -30,35 +30,35 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
   unsigned Option = MO.getTargetFlags() & ARMII::MO_OPTION_MASK;
   switch (Option) {
   default: {
-    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+    Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
                                    OutContext);
     switch (Option) {
     default: llvm_unreachable("Unknown target flag on symbol operand");
     case ARMII::MO_NO_FLAG:
       break;
     case ARMII::MO_LO16:
-      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+      Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
                                      OutContext);
-      Expr = ARMMCExpr::CreateLower16(Expr, OutContext);
+      Expr = ARMMCExpr::createLower16(Expr, OutContext);
       break;
     case ARMII::MO_HI16:
-      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+      Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
                                      OutContext);
-      Expr = ARMMCExpr::CreateUpper16(Expr, OutContext);
+      Expr = ARMMCExpr::createUpper16(Expr, OutContext);
       break;
     }
     break;
   }
 
   case ARMII::MO_PLT:
-    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_PLT,
+    Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_PLT,
                                    OutContext);
     break;
   }
 
   if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-                                   MCConstantExpr::Create(MO.getOffset(),
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(),
                                                           OutContext),
                                    OutContext);
   return MCOperand::createExpr(Expr);
@@ -80,7 +80,7 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
     MCOp = MCOperand::createImm(MO.getImm());
     break;
   case MachineOperand::MO_MachineBasicBlock:
-    MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(
+    MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
         MO.getMBB()->getSymbol(), OutContext));
     break;
   case MachineOperand::MO_GlobalAddress: {
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index e794fb71af63..0aceaed87510 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -304,10 +304,6 @@ public:
     return getTM<ARMBaseTargetMachine>();
   }
 
-  const ARMSubtarget &getARMSubtarget() const {
-    return *getARMTargetMachine().getSubtargetImpl();
-  }
-
   void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
@@ -330,24 +326,28 @@ void ARMPassConfig::addIRPasses() {
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
   // ldrex/strex loops to simplify this, but it needs tidying up.
-  const ARMSubtarget *Subtarget = &getARMSubtarget();
-  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
-    if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
-      addPass(createCFGSimplificationPass());
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+    addPass(createCFGSimplificationPass(-1, [this](const Function &F) {
+      const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F);
+      return ST.hasAnyDataBarrier() && !ST.isThumb1Only();
+    }));
 
   TargetPassConfig::addIRPasses();
 }
 
 bool ARMPassConfig::addPreISel() {
-  if ((TM->getOptLevel() == CodeGenOpt::Aggressive &&
+  if ((TM->getOptLevel() != CodeGenOpt::None &&
        EnableGlobalMerge == cl::BOU_UNSET) ||
-      EnableGlobalMerge == cl::BOU_TRUE)
+      EnableGlobalMerge == cl::BOU_TRUE) {
     // FIXME: This is using the thumb1 only constant value for
     // maximal global offset for merging globals. We may want
     // to look into using the old value for non-thumb1 code of
     // 4095 based on the TargetMachine, but this starts to become
     // tricky when doing code gen per function.
-    addPass(createGlobalMergePass(TM, 127));
+    bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) &&
+                               (EnableGlobalMerge == cl::BOU_UNSET);
+    addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize));
+  }
 
   return false;
 }
@@ -387,10 +387,13 @@ void ARMPassConfig::addPreSched2() {
 
   if (getOptLevel() != CodeGenOpt::None) {
     // in v8, IfConversion depends on Thumb instruction widths
-    if (getARMSubtarget().restrictIT())
-      addPass(createThumb2SizeReductionPass());
-    if (!getARMSubtarget().isThumb1Only())
-      addPass(&IfConverterID);
+    addPass(createThumb2SizeReductionPass([this](const Function &F) {
+      return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
+    }));
+
+    addPass(createIfConverter([this](const Function &F) {
+      return !this->TM->getSubtarget<ARMSubtarget>(F).isThumb1Only();
+    }));
   }
   addPass(createThumb2ITBlockPass());
 }
@@ -399,8 +402,9 @@ void ARMPassConfig::addPreEmitPass() {
   addPass(createThumb2SizeReductionPass());
 
   // Constant island pass work on unbundled instructions.
-  if (getARMSubtarget().isThumb2())
-    addPass(&UnpackMachineBundlesID);
+  addPass(createUnpackMachineBundles([this](const Function &F) {
+    return this->TM->getSubtarget<ARMSubtarget>(F).isThumb2();
+  }));
 
   // Don't optimize barriers at -O0.
   if (getOptLevel() != CodeGenOpt::None)
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 80f03c62bbfb..eaed5cc68750 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -50,12 +50,12 @@ const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
 
   assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
 
-  return MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang),
+  return MCSymbolRefExpr::create(TM.getSymbol(GV, Mang),
                                  MCSymbolRefExpr::VK_ARM_TARGET2, getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::
 getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO,
+  return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_ARM_TLSLDO,
                                  getContext());
 }
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 30c7d62e84b8..8bcbb1159f81 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -1051,7 +1051,7 @@ public:
     if (!CE) return false;
     int64_t Value = CE->getValue();
     return (ARM_AM::getSOImmVal(Value) != -1 ||
-            ARM_AM::getSOImmVal(-Value) != -1);;
+            ARM_AM::getSOImmVal(-Value) != -1);
   }
   bool isT2SOImm() const {
     if (!isImm()) return false;
@@ -4252,7 +4252,7 @@ ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
     Error(S, "'be' or 'le' operand expected");
     return MatchOperand_ParseFail;
   }
-  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::Create(Val,
+  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::create(Val,
                                                                   getContext()),
                                            S, Tok.getEndLoc()));
   return MatchOperand_Success;
@@ -4656,7 +4656,7 @@ ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
       Val = INT32_MIN;
 
     Operands.push_back(
-      ARMOperand::CreateImm(MCConstantExpr::Create(Val, getContext()), S, E));
+      ARMOperand::CreateImm(MCConstantExpr::create(Val, getContext()), S, E));
 
     return MatchOperand_Success;
   }
@@ -4886,7 +4886,7 @@ bool ARMAsmParser::parseMemory(OperandVector &Operands) {
     // If the constant was #-0, represent it as INT32_MIN.
     int32_t Val = CE->getValue();
     if (isNegative && Val == 0)
-      CE = MCConstantExpr::Create(INT32_MIN, getContext());
+      CE = MCConstantExpr::create(INT32_MIN, getContext());
 
     // Now we should have the closing ']'
     if (Parser.getTok().isNot(AsmToken::RBrac))
@@ -5073,7 +5073,7 @@ ARMAsmParser::parseFPImm(OperandVector &Operands) {
     IntVal ^= (uint64_t)isNegative << 31;
     Parser.Lex(); // Eat the token.
     Operands.push_back(ARMOperand::CreateImm(
-          MCConstantExpr::Create(IntVal, getContext()),
+          MCConstantExpr::create(IntVal, getContext()),
           S, Parser.getTok().getLoc()));
     return MatchOperand_Success;
   }
@@ -5090,7 +5090,7 @@ ARMAsmParser::parseFPImm(OperandVector &Operands) {
     Val = APFloat(RealVal).bitcastToAPInt().getZExtValue();
 
     Operands.push_back(ARMOperand::CreateImm(
-        MCConstantExpr::Create(Val, getContext()), S,
+        MCConstantExpr::create(Val, getContext()), S,
         Parser.getTok().getLoc()));
     return MatchOperand_Success;
   }
@@ -5179,7 +5179,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
       if (CE) {
         int32_t Val = CE->getValue();
         if (isNegative && Val == 0)
-          ImmVal = MCConstantExpr::Create(INT32_MIN, getContext());
+          ImmVal = MCConstantExpr::create(INT32_MIN, getContext());
       }
       E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
       Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
@@ -5209,7 +5209,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
     if (getParser().parseExpression(SubExprVal))
       return true;
 
-    const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal,
+    const MCExpr *ExprVal = ARMMCExpr::create(RefKind, SubExprVal,
                                               getContext());
     E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
     Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
@@ -5765,7 +5765,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   // Add the processor imod operand, if necessary.
   if (ProcessorIMod) {
     Operands.push_back(ARMOperand::CreateImm(
-          MCConstantExpr::Create(ProcessorIMod, getContext()),
+          MCConstantExpr::create(ProcessorIMod, getContext()),
                                  NameLoc, NameLoc));
   } else if (Mnemonic == "cps" && isMClass()) {
     return Error(NameLoc, "instruction 'cps' requires effect for M-class");
@@ -6752,13 +6752,13 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
       MCSymbol *Dot = getContext().createTempSymbol();
       Out.EmitLabel(Dot);
       const MCExpr *OpExpr = Inst.getOperand(2).getExpr();
-      const MCExpr *InstPC = MCSymbolRefExpr::Create(Dot,
+      const MCExpr *InstPC = MCSymbolRefExpr::create(Dot,
                                                      MCSymbolRefExpr::VK_None,
                                                      getContext());
-      const MCExpr *Const8 = MCConstantExpr::Create(8, getContext());
-      const MCExpr *ReadPC = MCBinaryExpr::CreateAdd(InstPC, Const8,
+      const MCExpr *Const8 = MCConstantExpr::create(8, getContext());
+      const MCExpr *ReadPC = MCBinaryExpr::createAdd(InstPC, Const8,
                                                      getContext());
-      const MCExpr *FixupAddr = MCBinaryExpr::CreateAdd(ReadPC, OpExpr,
+      const MCExpr *FixupAddr = MCBinaryExpr::createAdd(ReadPC, OpExpr,
                                                         getContext());
       TmpInst.addOperand(MCOperand::createExpr(FixupAddr));
     }
@@ -9168,74 +9168,19 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
   StringRef CPU = getParser().parseStringToEndOfStatement().trim();
   getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
 
+  // FIXME: This is using table-gen data, but should be moved to
+  // ARMTargetParser once that is table-gen'd.
   if (!STI.isCPUStringValid(CPU)) {
     Error(L, "Unknown CPU name");
     return false;
   }
 
-  // FIXME: This switches the CPU features globally, therefore it might
-  // happen that code you would not expect to assemble will. For details
-  // see: http://llvm.org/bugs/show_bug.cgi?id=20757
   STI.InitMCProcessorInfo(CPU, "");
   STI.InitCPUSchedModel(CPU);
   setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
 
   return false;
 }
-
-// FIXME: This is duplicated in getARMFPUFeatures() in
-// tools/clang/lib/Driver/Tools.cpp
-static const struct {
-  const unsigned ID;
-  const FeatureBitset Enabled;
-  const FeatureBitset Disabled;
-} FPUs[] = {
-    {/* ID */ ARM::FK_VFP, 
-     /* Enabled */ {ARM::FeatureVFP2}, 
-     /* Disabled */ {ARM::FeatureNEON}},
-    {/* ID */ ARM::FK_VFPV2, 
-     /* Enabled */ {ARM::FeatureVFP2}, 
-     /* Disabled */ {ARM::FeatureNEON}},
-    {/* ID */ ARM::FK_VFPV3, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3},  
-     /* Disabled */ {ARM::FeatureNEON, ARM::FeatureD16}},
-    {/* ID */ ARM::FK_VFPV3_D16, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureD16},
-     /* Disabled */ {ARM::FeatureNEON}},
-    {/* ID */ ARM::FK_VFPV4, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4},
-     /* Disabled */ {ARM::FeatureNEON, ARM::FeatureD16}},
-    {/* ID */ ARM::FK_VFPV4_D16, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4,
-                    ARM::FeatureD16},
-     /* Disabled */ {ARM::FeatureNEON}},
-    {/* ID */ ARM::FK_FPV5_D16, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4,
-                    ARM::FeatureFPARMv8, ARM::FeatureD16},
-     /* Disabled */ {ARM::FeatureNEON, ARM::FeatureCrypto}},
-    {/* ID */ ARM::FK_FP_ARMV8, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4,
-                    ARM::FeatureFPARMv8},
-     /* Disabled */ {ARM::FeatureNEON, ARM::FeatureCrypto, ARM::FeatureD16}},
-    {/* ID */ ARM::FK_NEON, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON}, 
-     /* Disabled */ {ARM::FeatureD16}},
-    {/* ID */ ARM::FK_NEON_VFPV4, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4,
-                    ARM::FeatureNEON}, 
-     /* Disabled */ {ARM::FeatureD16}},
-    {/* ID */ ARM::FK_NEON_FP_ARMV8, 
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4,
-                    ARM::FeatureFPARMv8, ARM::FeatureNEON},
-     /* Disabled */ {ARM::FeatureCrypto, ARM::FeatureD16}},
-    {/* ID */ ARM::FK_CRYPTO_NEON_FP_ARMV8,
-     /* Enabled */ {ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureVFP4,
-                    ARM::FeatureFPARMv8, ARM::FeatureNEON, 
-                    ARM::FeatureCrypto},
-     /* Disabled */ {ARM::FeatureD16}},
-    {ARM::FK_SOFTVFP, {}, {}},
-};
-
 /// parseDirectiveFPU
 ///  ::= .fpu str
 bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
@@ -9243,23 +9188,15 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
   StringRef FPU = getParser().parseStringToEndOfStatement().trim();
 
   unsigned ID = ARMTargetParser::parseFPU(FPU);
-
-  if (ID == ARM::FK_INVALID) {
+  std::vector<const char *> Features;
+  if (!ARMTargetParser::getFPUFeatures(ID, Features)) {
     Error(FPUNameLoc, "Unknown FPU name");
     return false;
   }
 
-  for (const auto &Entry : FPUs) {
-    if (Entry.ID != ID)
-      continue;
-
-    // Need to toggle features that should be on but are off and that
-    // should off but are on.
-    FeatureBitset Toggle = (Entry.Enabled & ~STI.getFeatureBits()) |
-                           (Entry.Disabled & STI.getFeatureBits());
-    setAvailableFeatures(ComputeAvailableFeatures(STI.ToggleFeature(Toggle)));
-    break;
-  }
+  for (auto Feature : Features)
+    STI.ApplyFeatureFlag(Feature);
+  setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
 
   getTargetStreamer().emitFPU(ID);
   return false;
@@ -9804,7 +9741,7 @@ bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
   }
 
   const MCSymbolRefExpr *SRE =
-    MCSymbolRefExpr::Create(Parser.getTok().getIdentifier(),
+    MCSymbolRefExpr::create(Parser.getTok().getIdentifier(),
                             MCSymbolRefExpr::VK_ARM_TLSDESCSEQ, getContext());
   Lex();
 
@@ -9982,33 +9919,32 @@ extern "C" void LLVMInitializeARMAsmParser() {
 #define GET_MATCHER_IMPLEMENTATION
 #include "ARMGenAsmMatcher.inc"
 
+// FIXME: This structure should be moved inside ARMTargetParser
+// when we start to table-generate them, and we can use the ARM
+// flags below, that were generated by table-gen.
 static const struct {
-  const char *Name;
+  const ARM::ArchExtKind Kind;
   const unsigned ArchCheck;
   const FeatureBitset Features;
 } Extensions[] = {
-  { "crc", Feature_HasV8, {ARM::FeatureCRC} },
-  { "crypto",  Feature_HasV8,
+  { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} },
+  { ARM::AEK_CRYPTO,  Feature_HasV8,
     {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
-  { "fp", Feature_HasV8, {ARM::FeatureFPARMv8} },
-  { "idiv", Feature_HasV7 | Feature_IsNotMClass,
+  { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} },
+  { ARM::AEK_HWDIV, Feature_HasV7 | Feature_IsNotMClass,
     {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} },
-  // FIXME: iWMMXT not supported
-  { "iwmmxt", Feature_None, {} },
-  // FIXME: iWMMXT2 not supported
-  { "iwmmxt2", Feature_None, {} },
-  // FIXME: Maverick not supported
-  { "maverick", Feature_None, {} },
-  { "mp", Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
-  // FIXME: ARMv6-m OS Extensions feature not checked
-  { "os", Feature_None, {} },
+  { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
+  { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
   // FIXME: Also available in ARMv6-K
-  { "sec", Feature_HasV7, {ARM::FeatureTrustZone} },
-  { "simd", Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
+  { ARM::AEK_SEC, Feature_HasV7, {ARM::FeatureTrustZone} },
   // FIXME: Only available in A-class, isel not predicated
-  { "virt", Feature_HasV7, {ARM::FeatureVirtualization} },
-  // FIXME: xscale not supported
-  { "xscale", Feature_None, {} },
+  { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} },
+  // FIXME: Unsupported extensions.
+  { ARM::AEK_OS, Feature_None, {} },
+  { ARM::AEK_IWMMXT, Feature_None, {} },
+  { ARM::AEK_IWMMXT2, Feature_None, {} },
+  { ARM::AEK_MAVERICK, Feature_None, {} },
+  { ARM::AEK_XSCALE, Feature_None, {} },
 };
 
 /// parseDirectiveArchExtension
@@ -10031,9 +9967,12 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
     EnableFeature = false;
     Name = Name.substr(2);
   }
+  unsigned FeatureKind = ARMTargetParser::parseArchExt(Name);
+  if (FeatureKind == ARM::AEK_INVALID)
+    Error(ExtLoc, "unknown architectural extension: " + Name);
 
   for (const auto &Extension : Extensions) {
-    if (Extension.Name != Name)
+    if (Extension.Kind != FeatureKind)
       continue;
 
     if (Extension.Features.none())
@@ -10080,7 +10019,7 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
     if (Op.isImm()) {
       const MCExpr *SOExpr = Op.getImm();
       int64_t Value;
-      if (!SOExpr->EvaluateAsAbsolute(Value))
+      if (!SOExpr->evaluateAsAbsolute(Value))
         return Match_Success;
       assert((Value >= INT32_MIN && Value <= UINT32_MAX) &&
              "expression value must be representable in 32 bits");
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 2d36c3020016..0bff52141da5 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -329,7 +329,8 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     const MCExpr *Expr = Op.getExpr();
     switch (Expr->getKind()) {
     case MCExpr::Binary:
-      O << '#' << *Expr;
+      O << '#';
+      Expr->print(O, &MAI);
       break;
     case MCExpr::Constant: {
       // If a symbolic branch target was added as a constant expression then
@@ -337,8 +338,9 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       // address.
       const MCConstantExpr *Constant = cast<MCConstantExpr>(Expr);
       int64_t TargetAddress;
-      if (!Constant->EvaluateAsAbsolute(TargetAddress)) {
-        O << '#' << *Expr;
+      if (!Constant->evaluateAsAbsolute(TargetAddress)) {
+        O << '#';
+        Expr->print(O, &MAI);
       } else {
         O << "0x";
         O.write_hex(static_cast<uint32_t>(TargetAddress));
@@ -348,7 +350,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     default:
       // FIXME: Should we always treat this as if it is a constant literal and
       // prefix it with '#'?
-      O << *Expr;
+      Expr->print(O, &MAI);
       break;
     }
   }
@@ -359,7 +361,7 @@ void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                                raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   if (MO1.isExpr()) {
-    O << *MO1.getExpr();
+    MO1.getExpr()->print(O, &MAI);
     return;
   }
 
@@ -1055,7 +1057,7 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
   const MCOperand &MO = MI->getOperand(OpNum);
 
   if (MO.isExpr()) {
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, &MAI);
     return;
   }
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index f0eed9b811d4..b03cada9a641 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -622,8 +622,6 @@ namespace ARM_AM {
     return Value;
   }
 
-  AMSubMode getLoadStoreMultipleSubMode(int Opcode);
-
   //===--------------------------------------------------------------------===//
   // Floating-point Immediates
   //
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 6c1f7891f58a..be23e9070103 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -260,9 +260,9 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
         hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
     uint64_t NumNops = Count / 2;
     for (uint64_t i = 0; i != NumNops; ++i)
-      OW->Write16(nopEncoding);
+      OW->write16(nopEncoding);
     if (Count & 1)
-      OW->Write8(0);
+      OW->write8(0);
     return true;
   }
   // ARM mode
@@ -270,21 +270,21 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
       hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
   uint64_t NumNops = Count / 4;
   for (uint64_t i = 0; i != NumNops; ++i)
-    OW->Write32(nopEncoding);
+    OW->write32(nopEncoding);
   // FIXME: should this function return false when unable to write exactly
   // 'Count' bytes with NOP encodings?
   switch (Count % 4) {
   default:
     break; // No leftover bytes to write
   case 1:
-    OW->Write8(0);
+    OW->write8(0);
     break;
   case 2:
-    OW->Write16(0);
+    OW->write16(0);
     break;
   case 3:
-    OW->Write16(0);
-    OW->Write8(0xa0);
+    OW->write16(0);
+    OW->write8(0xa0);
     break;
   }
 
@@ -601,8 +601,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
   // the offset when the destination has the same MCFragment.
   if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
     const MCSymbol &Sym = A->getSymbol();
-    const MCSymbolData &SymData = Asm.getSymbolData(Sym);
-    IsResolved = (SymData.getFragment() == DF);
+    IsResolved = (Sym.getFragment() == DF);
   }
   // We must always generate a relocation for BL/BLX instructions if we have
   // a symbol to reference, as the linker relies on knowing the destination
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index f4fedeef650b..804d3534096a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -37,7 +37,7 @@ namespace {
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
 
-    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+    bool needsRelocateWithSymbol(const MCSymbol &Sym,
                                  unsigned Type) const override;
   };
 }
@@ -49,7 +49,7 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
 
 ARMELFObjectWriter::~ARMELFObjectWriter() {}
 
-bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
                                                  unsigned Type) const {
   // FIXME: This is extremely conservative. This really needs to use a
   // whitelist with a clear explanation for why each realocation needs to
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 0eb5a8136e88..6e3af739eca2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -22,9 +22,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
@@ -34,7 +32,7 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ARMBuildAttributes.h"
 #include "llvm/Support/ARMEHABI.h"
@@ -216,7 +214,13 @@ ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
 }
 
 void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
-  OS << "\t.thumb_set\t" << *Symbol << ", " << *Value << '\n';
+  const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+
+  OS << "\t.thumb_set\t";
+  Symbol->print(OS, MAI);
+  OS << ", ";
+  Value->print(OS, MAI);
+  OS << '\n';
 }
 
 void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
@@ -562,17 +566,16 @@ private:
     MCSymbol *Start = getContext().createTempSymbol();
     EmitLabel(Start);
 
-    MCSymbol *Symbol =
-      getContext().getOrCreateSymbol(Name + "." +
-                                     Twine(MappingSymbolCounter++));
+    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++)));
 
-    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-    MCELF::SetType(SD, ELF::STT_NOTYPE);
-    MCELF::SetBinding(SD, ELF::STB_LOCAL);
-    SD.setExternal(false);
+    getAssembler().registerSymbol(*Symbol);
+    Symbol->setType(ELF::STT_NOTYPE);
+    Symbol->setBinding(ELF::STB_LOCAL);
+    Symbol->setExternal(false);
     AssignSection(Symbol, getCurrentSection().first);
 
-    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+    const MCExpr *Value = MCSymbolRefExpr::create(Start, getContext());
     Symbol->setVariableValue(Value);
   }
 
@@ -688,16 +691,16 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
   using namespace ARMBuildAttrs;
 
   setAttributeItem(CPU_name,
-                   ARMTargetParser::getArchDefaultCPUName(Arch),
+                   ARMTargetParser::getCPUAttr(Arch),
                    false);
 
   if (EmittedArch == ARM::AK_INVALID)
     setAttributeItem(CPU_arch,
-                     ARMTargetParser::getArchDefaultCPUArch(Arch),
+                     ARMTargetParser::getArchAttr(Arch),
                      false);
   else
     setAttributeItem(CPU_arch,
-                     ARMTargetParser::getArchDefaultCPUArch(EmittedArch),
+                     ARMTargetParser::getArchAttr(EmittedArch),
                      false);
 
   switch (Arch) {
@@ -813,6 +816,9 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
                      /* OverwriteExisting= */ false);
     break;
 
+  // ABI_HardFP_use is handled in ARMAsmPrinter, so _SP_D16 is treated the same
+  // as _D16 here.
+  case ARM::FK_FPV4_SP_D16:
   case ARM::FK_VFPV4_D16:
     setAttributeItem(ARMBuildAttrs::FP_arch,
                      ARMBuildAttrs::AllowFPv4B,
@@ -827,6 +833,7 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
 
   // FPV5_D16 is identical to FP_ARMV8 except for the number of D registers, so
   // uses the FP_ARMV8_D16 build attribute.
+  case ARM::FK_FPV5_SP_D16:
   case ARM::FK_FPV5_D16:
     setAttributeItem(ARMBuildAttrs::FP_arch,
                      ARMBuildAttrs::AllowFPARMv8B,
@@ -861,6 +868,7 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
     break;
 
   case ARM::FK_SOFTVFP:
+  case ARM::FK_NONE:
     break;
 
   default:
@@ -972,9 +980,9 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
   if (!Streamer.IsThumb)
     return;
 
-  const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol);
-  unsigned Type = MCELF::GetType(SD);
-  if (Type == ELF_STT_Func || Type == ELF_STT_GnuIFunc)
+  Streamer.getAssembler().registerSymbol(*Symbol);
+  unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
+  if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)
     Streamer.EmitThumbFunc(Symbol);
 }
 
@@ -1024,7 +1032,7 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
   }
 
   // Get .ARM.extab or .ARM.exidx section
-  const MCSymbol *Group = FnSection.getGroup();
+  const MCSymbolELF *Group = FnSection.getGroup();
   if (Group)
     Flags |= ELF::SHF_GROUP;
   MCSectionELF *EHSection =
@@ -1095,7 +1103,7 @@ void ARMELFStreamer::emitFnEnd() {
     EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
 
   const MCSymbolRefExpr *FnStartRef =
-    MCSymbolRefExpr::Create(FnStart,
+    MCSymbolRefExpr::create(FnStart,
                             MCSymbolRefExpr::VK_ARM_PREL31,
                             getContext());
 
@@ -1106,7 +1114,7 @@ void ARMELFStreamer::emitFnEnd() {
   } else if (ExTab) {
     // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
     const MCSymbolRefExpr *ExTabEntryRef =
-      MCSymbolRefExpr::Create(ExTab,
+      MCSymbolRefExpr::create(ExTab,
                               MCSymbolRefExpr::VK_ARM_PREL31,
                               getContext());
     EmitValue(ExTabEntryRef, 4);
@@ -1138,7 +1146,7 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
 void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
   const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
 
-  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
+  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(
       PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
 
   visitUsedExpr(*PersonalityRef);
@@ -1186,7 +1194,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
   // Emit personality
   if (Personality) {
     const MCSymbolRefExpr *PersonalityRef =
-      MCSymbolRefExpr::Create(Personality,
+      MCSymbolRefExpr::create(Personality,
                               MCSymbolRefExpr::VK_ARM_PREL31,
                               getContext());
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index caa873622ae9..1ac08159bd3d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -19,8 +19,7 @@ using namespace llvm;
 
 void ARMMCAsmInfoDarwin::anchor() { }
 
-ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(StringRef TT) {
-  Triple TheTriple(TT);
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
   if ((TheTriple.getArch() == Triple::armeb) ||
       (TheTriple.getArch() == Triple::thumbeb))
     IsLittleEndian = false;
@@ -41,8 +40,7 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(StringRef TT) {
 
 void ARMELFMCAsmInfo::anchor() { }
 
-ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) {
-  Triple TheTriple(TT);
+ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) {
   if ((TheTriple.getArch() == Triple::armeb) ||
       (TheTriple.getArch() == Triple::thumbeb))
     IsLittleEndian = false;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index 6cb471537f6e..99a5fff5ec27 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -19,18 +19,19 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
+  class Triple;
 
   class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
     virtual void anchor();
 
   public:
-    explicit ARMMCAsmInfoDarwin(StringRef TT);
+    explicit ARMMCAsmInfoDarwin(const Triple &TheTriple);
   };
 
   class ARMELFMCAsmInfo : public MCAsmInfoELF {
     void anchor() override;
   public:
-    explicit ARMELFMCAsmInfo(StringRef TT);
+    explicit ARMELFMCAsmInfo(const Triple &TT);
 
     void setUseIntegratedAssembler(bool Value) override;
   };
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 5b90de327418..2063ca6bdf3b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -16,12 +16,12 @@ using namespace llvm;
 #define DEBUG_TYPE "armmcexpr"
 
 const ARMMCExpr*
-ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+ARMMCExpr::create(VariantKind Kind, const MCExpr *Expr,
                        MCContext &Ctx) {
   return new (Ctx) ARMMCExpr(Kind, Expr);
 }
 
-void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
+void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   switch (Kind) {
   default: llvm_unreachable("Invalid kind!");
   case VK_ARM_HI16: OS << ":upper16:"; break;
@@ -31,7 +31,7 @@ void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
   const MCExpr *Expr = getSubExpr();
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << '(';
-  Expr->print(OS);
+  Expr->print(OS, MAI);
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << ')';
 }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index a52abe7760d1..9146d4def75a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -33,15 +33,15 @@ public:
   /// @name Construction
   /// @{
 
-  static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+  static const ARMMCExpr *create(VariantKind Kind, const MCExpr *Expr,
                                       MCContext &Ctx);
 
-  static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) {
-    return Create(VK_ARM_HI16, Expr, Ctx);
+  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+    return create(VK_ARM_HI16, Expr, Ctx);
   }
 
-  static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) {
-    return Create(VK_ARM_LO16, Expr, Ctx);
+  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx) {
+    return create(VK_ARM_LO16, Expr, Ctx);
   }
 
   /// @}
@@ -56,15 +56,15 @@ public:
 
   /// @}
 
-  void PrintImpl(raw_ostream &OS) const override;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override {
     return false;
   }
   void visitUsedExpr(MCStreamer &Streamer) const override;
-  MCSection *FindAssociatedSection() const override {
-    return getSubExpr()->FindAssociatedSection();
+  MCSection *findAssociatedSection() const override {
+    return getSubExpr()->findAssociatedSection();
   }
 
   // There are no TLS ARMMCExprs at the moment.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 30deba9a08c6..92c4d6a824ea 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -277,18 +277,17 @@ static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
   return X;
 }
 
-static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
-  Triple TheTriple(TT);
-
+static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
   MCAsmInfo *MAI;
   if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO())
-    MAI = new ARMMCAsmInfoDarwin(TT);
+    MAI = new ARMMCAsmInfoDarwin(TheTriple);
   else if (TheTriple.isWindowsItaniumEnvironment())
     MAI = new ARMCOFFMCAsmInfoGNU();
   else if (TheTriple.isWindowsMSVCEnvironment())
     MAI = new ARMCOFFMCAsmInfoMicrosoft();
   else
-    MAI = new ARMELFMCAsmInfo(TT);
+    MAI = new ARMELFMCAsmInfo(TheTriple);
 
   unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
   MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index d4b00e6e4fb5..4468132588cf 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -26,9 +26,9 @@ public:
                                              unsigned VariantKind) override {
     switch(VariantKind) {
     case LLVMDisassembler_VariantKind_ARM_HI16:
-      return ARMMCExpr::CreateUpper16(SubExpr, Ctx);
+      return ARMMCExpr::createUpper16(SubExpr, Ctx);
     case LLVMDisassembler_VariantKind_ARM_LO16:
-      return ARMMCExpr::CreateLower16(SubExpr, Ctx);
+      return ARMMCExpr::createLower16(SubExpr, Ctx);
     default:
       return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
                                                             VariantKind);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 9755330bf8c3..95d7ea7c04a3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -17,7 +17,6 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCFixupKindInfo.h"
-#include "llvm/MC/MCMachOSymbolFlags.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCValue.h"
@@ -49,12 +48,10 @@ class ARMMachObjectWriter : public MCMachObjectTargetWriter {
                                 const MCSymbol &S, uint64_t FixedValue);
 
 public:
-  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
-                               /*UseAggressiveSymbolFolding=*/true) {}
+  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
 
-  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override;
@@ -152,23 +149,21 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
-  if (!A_SD->getFragment())
+  if (!A->getFragment())
     Asm.getContext().reportFatalError(Fixup.getLoc(),
                        "symbol '" + A->getName() +
                        "' can not be undefined in a subtraction expression");
 
   uint32_t Value = Writer->getSymbolAddress(*A, Layout);
   uint32_t Value2 = 0;
-  uint64_t SecAddr =
-      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
   FixedValue += SecAddr;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbol *SB = &B->getSymbol();
 
-    if (!B_SD->getFragment())
+    if (!SB->getFragment())
       Asm.getContext().reportFatalError(Fixup.getLoc(),
                          "symbol '" + B->getSymbol().getName() +
                          "' can not be undefined in a subtraction expression");
@@ -176,7 +171,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
     // Select the appropriate difference relocation type.
     Type = MachO::ARM_RELOC_HALF_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
@@ -255,24 +250,22 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
-  if (!A_SD->getFragment())
+  if (!A->getFragment())
     Asm.getContext().reportFatalError(Fixup.getLoc(),
                        "symbol '" + A->getName() +
                        "' can not be undefined in a subtraction expression");
 
   uint32_t Value = Writer->getSymbolAddress(*A, Layout);
-  uint64_t SecAddr =
-      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
   FixedValue += SecAddr;
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
     assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols");
-    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbol *SB = &B->getSymbol();
 
-    if (!B_SD->getFragment())
+    if (!SB->getFragment())
       Asm.getContext().reportFatalError(Fixup.getLoc(),
                          "symbol '" + B->getSymbol().getName() +
                          "' can not be undefined in a subtraction expression");
@@ -280,7 +273,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
     // Select the appropriate difference relocation type.
     Type = MachO::ARM_RELOC_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
@@ -344,7 +337,7 @@ bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
   return false;
 }
 
-void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
+void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer,
                                            MCAssembler &Asm,
                                            const MCAsmLayout &Layout,
                                            const MCFragment *Fragment,
@@ -405,7 +398,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
     // Resolve constant variables.
     if (A->isVariable()) {
       int64_t Res;
-      if (A->getVariableValue()->EvaluateAsAbsolute(
+      if (A->getVariableValue()->evaluateAsAbsolute(
               Res, Layout, Writer->getSectionAddressMap())) {
         FixedValue = Res;
         return;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index b62ae2e3429e..68736bc1decd 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -94,12 +94,12 @@ static void TrackDefUses(MachineInstr *MI,
 /// conservatively remove more kill flags than are necessary, but removing them
 /// is safer than incorrect kill flags remaining on instructions.
 static void ClearKillFlags(MachineInstr *MI, SmallSet<unsigned, 4> &Uses) {
-  for (MIOperands MO(MI); MO.isValid(); ++MO) {
-    if (!MO->isReg() || MO->isDef() || !MO->isKill())
+  for (MachineOperand &MO : MI->operands()) {
+    if (!MO.isReg() || MO.isDef() || !MO.isKill())
       continue;
-    if (!Uses.count(MO->getReg()))
+    if (!Uses.count(MO.getReg()))
       continue;
-    MO->setIsKill(false);
+    MO.setIsKill(false);
   }
 }
 
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 0ab1ff906c9a..d9ab824995c1 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -133,7 +133,7 @@ namespace {
   class Thumb2SizeReduce : public MachineFunctionPass {
   public:
     static char ID;
-    Thumb2SizeReduce();
+    Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
 
     const Thumb2InstrInfo *TII;
     const ARMSubtarget *STI;
@@ -198,11 +198,14 @@ namespace {
     };
 
     SmallVector<MBBInfo, 8> BlockInfo;
+
+    std::function<bool(const Function &)> PredicateFtor;
   };
   char Thumb2SizeReduce::ID = 0;
 }
 
-Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) {
+Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
+    : MachineFunctionPass(ID), PredicateFtor(Ftor) {
   OptimizeSize = MinimizeSize = false;
   for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
     unsigned FromOpc = ReduceTable[i].WideOpc;
@@ -1000,6 +1003,9 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 }
 
 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  if (PredicateFtor && !PredicateFtor(*MF.getFunction()))
+    return false;
+
   STI = &static_cast<const ARMSubtarget &>(MF.getSubtarget());
   if (STI->isThumb1Only() || STI->prefers32BitThumb())
     return false;
@@ -1025,6 +1031,7 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
 
 /// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
 /// reduction pass.
-FunctionPass *llvm::createThumb2SizeReductionPass() {
-  return new Thumb2SizeReduce();
+FunctionPass *llvm::createThumb2SizeReductionPass(
+    std::function<bool(const Function &)> Ftor) {
+  return new Thumb2SizeReduce(Ftor);
 }
diff --git a/lib/Target/BPF/BPFAsmPrinter.cpp b/lib/Target/BPF/BPFAsmPrinter.cpp
index 32375968eac1..10ec6587550b 100644
--- a/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -83,5 +83,7 @@ void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
 // Force static initialization.
 extern "C" void LLVMInitializeBPFAsmPrinter() {
-  RegisterAsmPrinter<BPFAsmPrinter> X(TheBPFTarget);
+  RegisterAsmPrinter<BPFAsmPrinter> X(TheBPFleTarget);
+  RegisterAsmPrinter<BPFAsmPrinter> Y(TheBPFbeTarget);
+  RegisterAsmPrinter<BPFAsmPrinter> Z(TheBPFTarget);
 }
diff --git a/lib/Target/BPF/BPFMCInstLower.cpp b/lib/Target/BPF/BPFMCInstLower.cpp
index d608afb348cb..00bd8d9c090c 100644
--- a/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/lib/Target/BPF/BPFMCInstLower.cpp
@@ -33,7 +33,7 @@ BPFMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
 MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                              MCSymbol *Sym) const {
 
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
 
   if (!MO.isJTI() && MO.getOffset())
     llvm_unreachable("unknown symbol op");
@@ -63,7 +63,7 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       break;
     case MachineOperand::MO_MachineBasicBlock:
       MCOp = MCOperand::createExpr(
-          MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
+          MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx));
       break;
     case MachineOperand::MO_RegisterMask:
       continue;
diff --git a/lib/Target/BPF/BPFTargetMachine.cpp b/lib/Target/BPF/BPFTargetMachine.cpp
index 9487427fef5e..3329d5f87409 100644
--- a/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/lib/Target/BPF/BPFTargetMachine.cpp
@@ -23,19 +23,24 @@ using namespace llvm;
 
 extern "C" void LLVMInitializeBPFTarget() {
   // Register the target.
-  RegisterTargetMachine<BPFTargetMachine> X(TheBPFTarget);
+  RegisterTargetMachine<BPFTargetMachine> X(TheBPFleTarget);
+  RegisterTargetMachine<BPFTargetMachine> Y(TheBPFbeTarget);
+  RegisterTargetMachine<BPFTargetMachine> Z(TheBPFTarget);
+}
+
+// DataLayout: "E" (big-endian) for Triple::bpfeb, else "e" (little-endian).
+static std::string computeDataLayout(StringRef TT) {
+  if (Triple(TT).getArch() == Triple::bpfeb)
+    return "E-m:e-p:64:64-i64:64-n32:64-S128";
+  else
+    return "e-m:e-p:64:64-i64:64-n32:64-S128";
 }
 
-// DataLayout --> Little-endian, 64-bit pointer/ABI/alignment
-// The stack is always 8 byte aligned
-// On function prologue, the stack is created by decrementing
-// its pointer. Once decremented, all references are done with positive
-// offset from the stack/frame pointer.
 BPFTargetMachine::BPFTargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                    StringRef FS, const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, "e-m:e-p:64:64-i64:64-n32:64-S128", TT, CPU, FS,
+    : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS,
                         Options, RM, CM, OL),
       TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index 48f34e484590..7b1d9259caf9 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -25,7 +25,10 @@ using namespace llvm;
 namespace {
 class BPFAsmBackend : public MCAsmBackend {
 public:
-  BPFAsmBackend() : MCAsmBackend() {}
+  bool IsLittleEndian;
+
+  BPFAsmBackend(bool IsLittleEndian)
+    : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
   ~BPFAsmBackend() override {}
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
@@ -54,7 +57,7 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
     return false;
 
   for (uint64_t i = 0; i < Count; i += 8)
-    OW->Write64(0x15000000);
+    OW->write64(0x15000000);
 
   return true;
 }
@@ -69,17 +72,28 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
   }
   assert(Fixup.getKind() == FK_PCRel_2);
   Value = (uint16_t)((Value - 8) / 8);
-  Data[Fixup.getOffset() + 2] = Value & 0xFF;
-  Data[Fixup.getOffset() + 3] = Value >> 8;
+  if (IsLittleEndian) {
+    Data[Fixup.getOffset() + 2] = Value & 0xFF;
+    Data[Fixup.getOffset() + 3] = Value >> 8;
+  } else {
+    Data[Fixup.getOffset() + 2] = Value >> 8;
+    Data[Fixup.getOffset() + 3] = Value & 0xFF;
+  }
 }
 
 MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
-  return createBPFELFObjectWriter(OS, 0);
+  return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
 }
 }
 
 MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
                                         const MCRegisterInfo &MRI, StringRef TT,
                                         StringRef CPU) {
-  return new BPFAsmBackend();
+  return new BPFAsmBackend(/*IsLittleEndian=*/true);
+}
+
+MCAsmBackend *llvm::createBPFbeAsmBackend(const Target &T,
+                                          const MCRegisterInfo &MRI, StringRef TT,
+                                          StringRef CPU) {
+  return new BPFAsmBackend(/*IsLittleEndian=*/false);
 }
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index a5562c1a933e..05ba6183e322 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -47,7 +47,8 @@ unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target,
   }
 }
 
-MCObjectWriter *llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI) {
+MCObjectWriter *llvm::createBPFELFObjectWriter(raw_pwrite_stream &OS,
+                                               uint8_t OSABI, bool IsLittleEndian) {
   MCELFObjectTargetWriter *MOTW = new BPFELFObjectWriter(OSABI);
-  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
 }
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index ab61ae7ae662..d63bbf49294e 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -16,13 +16,18 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
 
 namespace llvm {
 class Target;
+class Triple;
 
 class BPFMCAsmInfo : public MCAsmInfo {
 public:
-  explicit BPFMCAsmInfo(StringRef TT) {
+  explicit BPFMCAsmInfo(const Triple &TT) {
+    if (TT.getArch() == Triple::bpfeb)
+      IsLittleEndian = false;
+
     PrivateGlobalPrefix = ".L";
     WeakRefDirective = "\t.weak\t";
 
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index ba8a874e4966..dc4ede30f191 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -30,9 +30,11 @@ class BPFMCCodeEmitter : public MCCodeEmitter {
   BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
   void operator=(const BPFMCCodeEmitter &) = delete;
   const MCRegisterInfo &MRI;
+  bool IsLittleEndian;
 
 public:
-  BPFMCCodeEmitter(const MCRegisterInfo &mri) : MRI(mri) {}
+  BPFMCCodeEmitter(const MCRegisterInfo &mri, bool IsLittleEndian)
+    : MRI(mri), IsLittleEndian(IsLittleEndian) {}
 
   ~BPFMCCodeEmitter() {}
 
@@ -61,7 +63,13 @@ public:
 MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
                                             const MCRegisterInfo &MRI,
                                             MCContext &Ctx) {
-  return new BPFMCCodeEmitter(MRI);
+  return new BPFMCCodeEmitter(MRI, /*IsLittleEndian=*/true);
+}
+
+MCCodeEmitter *llvm::createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new BPFMCCodeEmitter(MRI, /*IsLittleEndian=*/false);
 }
 
 unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
@@ -91,59 +99,53 @@ unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
   return 0;
 }
 
-// Emit one byte through output stream
-void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) {
-  OS << (char)C;
-  ++CurByte;
-}
-
-// Emit a series of bytes (little endian)
-void EmitLEConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-                    raw_ostream &OS) {
-  assert(Size <= 8 && "size too big in emit constant");
-
-  for (unsigned i = 0; i != Size; ++i) {
-    EmitByte(Val & 255, CurByte, OS);
-    Val >>= 8;
-  }
-}
-
-// Emit a series of bytes (big endian)
-void EmitBEConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-                    raw_ostream &OS) {
-  assert(Size <= 8 && "size too big in emit constant");
-
-  for (int i = (Size - 1) * 8; i >= 0; i -= 8)
-    EmitByte((Val >> i) & 255, CurByte, OS);
+static uint8_t SwapBits(uint8_t Val) // swaps the 4-bit halves, not bit order
+{
+  return (Val & 0x0F) << 4 | (Val & 0xF0) >> 4;
 }
 
 void BPFMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                          SmallVectorImpl<MCFixup> &Fixups,
                                          const MCSubtargetInfo &STI) const {
   unsigned Opcode = MI.getOpcode();
-  // Keep track of the current byte being emitted
-  unsigned CurByte = 0;
+  support::endian::Writer<support::little> LE(OS);
+  support::endian::Writer<support::big> BE(OS); // shares OS with LE
 
   if (Opcode == BPF::LD_imm64 || Opcode == BPF::LD_pseudo) {
     uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
-    EmitByte(Value >> 56, CurByte, OS);
-    EmitByte(((Value >> 48) & 0xff), CurByte, OS);
-    EmitLEConstant(0, 2, CurByte, OS);
-    EmitLEConstant(Value & 0xffffFFFF, 4, CurByte, OS);
+    LE.write<uint8_t>(Value >> 56); // one byte: endianness is moot
+    if (IsLittleEndian)
+      LE.write<uint8_t>((Value >> 48) & 0xff);
+    else
+      LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff)); // nibbles swapped
+    LE.write<uint16_t>(0); // zero: same bytes in either order
+    if (IsLittleEndian)
+      LE.write<uint32_t>(Value & 0xffffFFFF);
+    else
+      BE.write<uint32_t>(Value & 0xffffFFFF);
 
     const MCOperand &MO = MI.getOperand(1);
     uint64_t Imm = MO.isImm() ? MO.getImm() : 0;
-    EmitByte(0, CurByte, OS);
-    EmitByte(0, CurByte, OS);
-    EmitLEConstant(0, 2, CurByte, OS);
-    EmitLEConstant(Imm >> 32, 4, CurByte, OS);
+    LE.write<uint8_t>(0);
+    LE.write<uint8_t>(0);
+    LE.write<uint16_t>(0); // all-zero fields: byte order cannot matter
+    if (IsLittleEndian)
+      LE.write<uint32_t>(Imm >> 32);
+    else
+      BE.write<uint32_t>(Imm >> 32);
   } else {
     // Get instruction encoding and emit it
     uint64_t Value = getBinaryCodeForInstr(MI, Fixups, STI);
-    EmitByte(Value >> 56, CurByte, OS);
-    EmitByte((Value >> 48) & 0xff, CurByte, OS);
-    EmitLEConstant((Value >> 32) & 0xffff, 2, CurByte, OS);
-    EmitLEConstant(Value & 0xffffFFFF, 4, CurByte, OS);
+    LE.write<uint8_t>(Value >> 56); // one byte: endianness is moot
+    if (IsLittleEndian) {
+      LE.write<uint8_t>((Value >> 48) & 0xff);
+      LE.write<uint16_t>((Value >> 32) & 0xffff);
+      LE.write<uint32_t>(Value & 0xffffFFFF);
+    } else {
+      LE.write<uint8_t>(SwapBits((Value >> 48) & 0xff)); // nibbles swapped
+      BE.write<uint16_t>((Value >> 32) & 0xffff);
+      BE.write<uint32_t>(Value & 0xffffFFFF);
+    }
   }
 }
 
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index c4cf4b824508..7cedba90a746 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -79,32 +79,43 @@ static MCInstPrinter *createBPFMCInstPrinter(const Triple &T,
 }
 
 extern "C" void LLVMInitializeBPFTargetMC() {
-  // Register the MC asm info.
-  RegisterMCAsmInfo<BPFMCAsmInfo> X(TheBPFTarget);
+  for (Target *T : {&TheBPFleTarget, &TheBPFbeTarget, &TheBPFTarget}) {
+    // Register the MC asm info.
+    RegisterMCAsmInfo<BPFMCAsmInfo> X(*T);
 
-  // Register the MC codegen info.
-  TargetRegistry::RegisterMCCodeGenInfo(TheBPFTarget, createBPFMCCodeGenInfo);
+    // Register the MC codegen info.
+    TargetRegistry::RegisterMCCodeGenInfo(*T, createBPFMCCodeGenInfo);
 
-  // Register the MC instruction info.
-  TargetRegistry::RegisterMCInstrInfo(TheBPFTarget, createBPFMCInstrInfo);
+    // Register the MC instruction info.
+    TargetRegistry::RegisterMCInstrInfo(*T, createBPFMCInstrInfo);
 
-  // Register the MC register info.
-  TargetRegistry::RegisterMCRegInfo(TheBPFTarget, createBPFMCRegisterInfo);
+    // Register the MC register info.
+    TargetRegistry::RegisterMCRegInfo(*T, createBPFMCRegisterInfo);
 
-  // Register the MC subtarget info.
-  TargetRegistry::RegisterMCSubtargetInfo(TheBPFTarget,
-                                          createBPFMCSubtargetInfo);
+    // Register the MC subtarget info.
+    TargetRegistry::RegisterMCSubtargetInfo(*T,
+                                            createBPFMCSubtargetInfo);
 
-  // Register the MC code emitter
-  TargetRegistry::RegisterMCCodeEmitter(TheBPFTarget,
-                                        llvm::createBPFMCCodeEmitter);
+    // Register the object streamer
+    TargetRegistry::RegisterELFStreamer(*T, createBPFMCStreamer);
 
-  // Register the ASM Backend
-  TargetRegistry::RegisterMCAsmBackend(TheBPFTarget, createBPFAsmBackend);
+    // Register the MCInstPrinter.
+    TargetRegistry::RegisterMCInstPrinter(*T, createBPFMCInstPrinter);
+  }
 
-  // Register the object streamer
-  TargetRegistry::RegisterELFStreamer(TheBPFTarget, createBPFMCStreamer);
+  // Register the MC code emitter (endian-specific, so outside the loop)
+  TargetRegistry::RegisterMCCodeEmitter(TheBPFleTarget, createBPFMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheBPFbeTarget, createBPFbeMCCodeEmitter);
 
-  // Register the MCInstPrinter.
-  TargetRegistry::RegisterMCInstPrinter(TheBPFTarget, createBPFMCInstPrinter);
+  // Register the ASM Backend (endian-specific as well)
+  TargetRegistry::RegisterMCAsmBackend(TheBPFleTarget, createBPFAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheBPFbeTarget, createBPFbeAsmBackend);
+
+  if (sys::IsLittleEndianHost) { // plain "bpf" target follows host byte order
+    TargetRegistry::RegisterMCCodeEmitter(TheBPFTarget, createBPFMCCodeEmitter);
+    TargetRegistry::RegisterMCAsmBackend(TheBPFTarget, createBPFAsmBackend);
+  } else {
+    TargetRegistry::RegisterMCCodeEmitter(TheBPFTarget, createBPFbeMCCodeEmitter);
+    TargetRegistry::RegisterMCAsmBackend(TheBPFTarget, createBPFbeAsmBackend);
+  }
 }
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index ce08b7cf76e6..a9ba7d990e17 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -30,16 +30,24 @@ class StringRef;
 class raw_ostream;
 class raw_pwrite_stream;
 
+extern Target TheBPFleTarget;
+extern Target TheBPFbeTarget;
 extern Target TheBPFTarget;
 
 MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCRegisterInfo &MRI,
                                       MCContext &Ctx);
+MCCodeEmitter *createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
+                                        const MCRegisterInfo &MRI,
+                                        MCContext &Ctx);
 
 MCAsmBackend *createBPFAsmBackend(const Target &T, const MCRegisterInfo &MRI,
                                   StringRef TT, StringRef CPU);
+MCAsmBackend *createBPFbeAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                    StringRef TT, StringRef CPU);
 
-MCObjectWriter *createBPFELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI);
+MCObjectWriter *createBPFELFObjectWriter(raw_pwrite_stream &OS,
+                                         uint8_t OSABI, bool IsLittleEndian);
 }
 
 // Defines symbolic names for BPF registers.  This defines a mapping from
diff --git a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
index 87716e6775cf..a16dbae867b2 100644
--- a/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
+++ b/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -11,8 +11,18 @@
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-Target llvm::TheBPFTarget;
+namespace llvm {
+Target TheBPFleTarget;
+Target TheBPFbeTarget;
+Target TheBPFTarget;
+}
 
 extern "C" void LLVMInitializeBPFTargetInfo() {
-  RegisterTarget<Triple::bpf, /*HasJIT=*/true> X(TheBPFTarget, "bpf", "BPF");
+  TargetRegistry::RegisterTarget(TheBPFTarget, "bpf",
+                                 "BPF (host endian)",
+                                 [](Triple::ArchType) { return false; }, true);
+  RegisterTarget<Triple::bpfel, /*HasJIT=*/true> X(
+      TheBPFleTarget, "bpfel", "BPF (little endian)");
+  RegisterTarget<Triple::bpfeb, /*HasJIT=*/true> Y(
+      TheBPFbeTarget, "bpfeb", "BPF (big endian)");
 }
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 1805437b12f7..e6d0199952f4 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMTarget
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp
   TargetMachineC.cpp
+  TargetRecip.cpp
   TargetSubtargetInfo.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index f1a7127e8fd9..b8377986ecc0 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -645,8 +645,7 @@ void CppWriter::printType(Type* Ty) {
     if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
       std::string elemName(getCppName(ET));
       Out << "ArrayType* " << typeName << " = ArrayType::get("
-          << elemName
-          << ", " << utostr(AT->getNumElements()) << ");";
+          << elemName << ", " << AT->getNumElements() << ");";
       nl(Out);
     }
     break;
@@ -658,8 +657,7 @@ void CppWriter::printType(Type* Ty) {
     if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
       std::string elemName(getCppName(ET));
       Out << "PointerType* " << typeName << " = PointerType::get("
-          << elemName
-          << ", " << utostr(PT->getAddressSpace()) << ");";
+          << elemName << ", " << PT->getAddressSpace() << ");";
       nl(Out);
     }
     break;
@@ -671,8 +669,7 @@ void CppWriter::printType(Type* Ty) {
     if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
       std::string elemName(getCppName(ET));
       Out << "VectorType* " << typeName << " = VectorType::get("
-          << elemName
-          << ", " << utostr(PT->getNumElements()) << ");";
+          << elemName << ", " << PT->getNumElements() << ");";
       nl(Out);
     }
     break;
@@ -1029,7 +1026,7 @@ void CppWriter::printVariableHead(const GlobalVariable *GV) {
   }
   if (GV->getAlignment()) {
     printCppName(GV);
-    Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");";
+    Out << "->setAlignment(" << GV->getAlignment() << ");";
     nl(Out);
   }
   if (GV->getVisibility() != GlobalValue::DefaultVisibility) {
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index a60d1e471944..14f9d777580c 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -7,9 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Hexagon.h"
 #include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "MCTargetDesc/HexagonMCTargetDesc.h"
+
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCExpr.h"
@@ -27,6 +29,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace Hexagon;
 
 #define DEBUG_TYPE "hexagon-disassembler"
 
@@ -37,9 +40,14 @@ namespace {
 /// \brief Hexagon disassembler for all Hexagon platforms.
 class HexagonDisassembler : public MCDisassembler {
 public:
+  std::unique_ptr<MCInst *> CurrentBundle;
   HexagonDisassembler(MCSubtargetInfo const &STI, MCContext &Ctx)
-      : MCDisassembler(STI, Ctx) {}
+      : MCDisassembler(STI, Ctx), CurrentBundle(new MCInst *) {}
 
+  DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
+                                    ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                    raw_ostream &VStream, raw_ostream &CStream,
+                                    bool &Complete) const;
   DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &VStream,
@@ -48,37 +56,43 @@ public:
 }
 
 static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t Address, const void *Decoder);
+                                               uint64_t Address,
+                                               const void *Decoder);
 static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t Address, const void *Decoder);
+                                               uint64_t Address,
+                                               const void *Decoder);
 static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                   uint64_t Address, void const *Decoder);
+                                                 uint64_t Address,
+                                                 void const *Decoder);
+
+static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
+                                 raw_ostream &os);
+static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst);
 
 static const uint16_t IntRegDecoderTable[] = {
-  Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
-  Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9,
-  Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
-  Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
-  Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24,
-  Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
-  Hexagon::R30, Hexagon::R31 };
+    Hexagon::R0,  Hexagon::R1,  Hexagon::R2,  Hexagon::R3,  Hexagon::R4,
+    Hexagon::R5,  Hexagon::R6,  Hexagon::R7,  Hexagon::R8,  Hexagon::R9,
+    Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
+    Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+    Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24,
+    Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
+    Hexagon::R30, Hexagon::R31};
 
-static const uint16_t PredRegDecoderTable[] = { Hexagon::P0, Hexagon::P1,
-Hexagon::P2, Hexagon::P3 };
+static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1,
+                                               Hexagon::P2, Hexagon::P3};
 
 static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
-  const uint16_t Table[], size_t Size) {
+                                        const uint16_t Table[], size_t Size) {
   if (RegNo < Size) {
     Inst.addOperand(MCOperand::createReg(Table[RegNo]));
     return MCDisassembler::Success;
-  }
-  else
+  } else
     return MCDisassembler::Fail;
 }
 
 static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t /*Address*/,
-  void const *Decoder) {
+                                               uint64_t /*Address*/,
+                                               void const *Decoder) {
   if (RegNo > 31)
     return MCDisassembler::Fail;
 
@@ -88,13 +102,13 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
 }
 
 static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t /*Address*/, const void *Decoder) {
+                                               uint64_t /*Address*/,
+                                               const void *Decoder) {
   static const uint16_t CtrlRegDecoderTable[] = {
-    Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
-    Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6, Hexagon::C7,
-    Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
-    Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPCH
-  };
+      Hexagon::SA0,  Hexagon::LC0,        Hexagon::SA1,  Hexagon::LC1,
+      Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6,   Hexagon::C7,
+      Hexagon::USR,  Hexagon::PC,         Hexagon::UGP,  Hexagon::GP,
+      Hexagon::CS0,  Hexagon::CS1,        Hexagon::UPCL, Hexagon::UPCH};
 
   if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0]))
     return MCDisassembler::Fail;
@@ -108,17 +122,15 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
 }
 
 static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                   uint64_t /*Address*/, void const *Decoder) {
+                                                 uint64_t /*Address*/,
+                                                 void const *Decoder) {
   static const uint16_t CtrlReg64DecoderTable[] = {
-    Hexagon::C1_0, Hexagon::NoRegister,
-    Hexagon::C3_2, Hexagon::NoRegister,
-    Hexagon::NoRegister, Hexagon::NoRegister,
-    Hexagon::C7_6, Hexagon::NoRegister,
-    Hexagon::C9_8, Hexagon::NoRegister,
-    Hexagon::C11_10, Hexagon::NoRegister,
-    Hexagon::CS, Hexagon::NoRegister,
-    Hexagon::UPC, Hexagon::NoRegister
-  };
+      Hexagon::C1_0,       Hexagon::NoRegister, Hexagon::C3_2,
+      Hexagon::NoRegister, Hexagon::NoRegister, Hexagon::NoRegister,
+      Hexagon::C7_6,       Hexagon::NoRegister, Hexagon::C9_8,
+      Hexagon::NoRegister, Hexagon::C11_10,     Hexagon::NoRegister,
+      Hexagon::CS,         Hexagon::NoRegister, Hexagon::UPC,
+      Hexagon::NoRegister};
 
   if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0]))
     return MCDisassembler::Fail;
@@ -132,7 +144,8 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
 }
 
 static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t /*Address*/, const void *Decoder) {
+                                               uint64_t /*Address*/,
+                                               const void *Decoder) {
   unsigned Register = 0;
   switch (RegNo) {
   case 0:
@@ -149,22 +162,21 @@ static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
 }
 
 static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t /*Address*/, const void *Decoder) {
+                                                  uint64_t /*Address*/,
+                                                  const void *Decoder) {
   static const uint16_t DoubleRegDecoderTable[] = {
-    Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3,
-    Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7,
-    Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11,
-    Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15
-  };
+      Hexagon::D0,  Hexagon::D1,  Hexagon::D2,  Hexagon::D3,
+      Hexagon::D4,  Hexagon::D5,  Hexagon::D6,  Hexagon::D7,
+      Hexagon::D8,  Hexagon::D9,  Hexagon::D10, Hexagon::D11,
+      Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15};
 
-  return (DecodeRegisterClass(Inst, RegNo >> 1,
-    DoubleRegDecoderTable,
-    sizeof (DoubleRegDecoderTable)));
+  return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable,
+      sizeof(DoubleRegDecoderTable) / sizeof(DoubleRegDecoderTable[0]));
 }
 
 static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
-  uint64_t /*Address*/,
-  void const *Decoder) {
+                                                uint64_t /*Address*/,
+                                                void const *Decoder) {
   if (RegNo > 3)
     return MCDisassembler::Fail;
 
@@ -191,17 +203,687 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                                  uint64_t Address,
                                                  raw_ostream &os,
                                                  raw_ostream &cs) const {
-  Size = 4;
-  if (Bytes.size() < 4)
-    return MCDisassembler::Fail;
+  DecodeStatus Result = DecodeStatus::Success;
+  bool Complete = false;
+  Size = 0;
+
+  *CurrentBundle = &MI;
+  MI.setOpcode(Hexagon::BUNDLE);
+  MI.addOperand(MCOperand::createImm(0));
+  while (Result == Success && Complete == false) {
+    if (Bytes.size() < HEXAGON_INSTR_SIZE)
+      return MCDisassembler::Fail;
+    MCInst *Inst = new (getContext()) MCInst;
+    Result = getSingleInstruction(*Inst, MI, Bytes, Address, os, cs, Complete);
+    MI.addOperand(MCOperand::createInst(Inst));
+    Size += HEXAGON_INSTR_SIZE;
+    Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
+  }
+  return Result;
+}
+
+DecodeStatus HexagonDisassembler::getSingleInstruction(
+    MCInst &MI, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &os, raw_ostream &cs, bool &Complete) const {
+  assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
 
-  uint32_t insn =
+  uint32_t Instruction =
       llvm::support::endian::read<uint32_t, llvm::support::little,
                                   llvm::support::unaligned>(Bytes.data());
 
-  // Remove parse bits.
-  insn &= ~static_cast<uint32_t>(HexagonII::InstParseBits::INST_PARSE_MASK);
-  DecodeStatus Result = decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
-  HexagonMCInstrInfo::AppendImplicitOperands(MI);
+  auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB);
+  if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+      HexagonII::INST_PARSE_LOOP_END) {
+    if (BundleSize == 0)
+      HexagonMCInstrInfo::setInnerLoop(MCB);
+    else if (BundleSize == 1)
+      HexagonMCInstrInfo::setOuterLoop(MCB);
+    else
+      return DecodeStatus::Fail;
+  }
+
+  DecodeStatus Result = DecodeStatus::Success;
+  if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+      HexagonII::INST_PARSE_DUPLEX) {
+    // Determine the instruction class of each instruction in the duplex.
+    unsigned duplexIClass, IClassLow, IClassHigh;
+
+    duplexIClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1);
+    switch (duplexIClass) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      IClassLow = HexagonII::HSIG_L1;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 1:
+      IClassLow = HexagonII::HSIG_L2;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 2:
+      IClassLow = HexagonII::HSIG_L2;
+      IClassHigh = HexagonII::HSIG_L2;
+      break;
+    case 3:
+      IClassLow = HexagonII::HSIG_A;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 4:
+      IClassLow = HexagonII::HSIG_L1;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 5:
+      IClassLow = HexagonII::HSIG_L2;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 6:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 7:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_A;
+      break;
+    case 8:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 9:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_L2;
+      break;
+    case 10:
+      IClassLow = HexagonII::HSIG_S1;
+      IClassHigh = HexagonII::HSIG_S1;
+      break;
+    case 11:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_S1;
+      break;
+    case 12:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_L1;
+      break;
+    case 13:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_L2;
+      break;
+    case 14:
+      IClassLow = HexagonII::HSIG_S2;
+      IClassHigh = HexagonII::HSIG_S2;
+      break;
+    }
+
+    // Set the MCInst to be a duplex instruction. Which one doesn't matter.
+    MI.setOpcode(Hexagon::DuplexIClass0);
+
+    // Decode each instruction in the duplex.
+    // Create an MCInst for each instruction.
+    unsigned instLow = Instruction & 0x1fff;
+    unsigned instHigh = (Instruction >> 16) & 0x1fff;
+    unsigned opLow;
+    if (GetSubinstOpcode(IClassLow, instLow, opLow, os) !=
+        MCDisassembler::Success)
+      return MCDisassembler::Fail;
+    unsigned opHigh;
+    if (GetSubinstOpcode(IClassHigh, instHigh, opHigh, os) !=
+        MCDisassembler::Success)
+      return MCDisassembler::Fail;
+    MCInst *MILow = new (getContext()) MCInst;
+    MILow->setOpcode(opLow);
+    MCInst *MIHigh = new (getContext()) MCInst;
+    MIHigh->setOpcode(opHigh);
+    AddSubinstOperands(MILow, opLow, instLow);
+    AddSubinstOperands(MIHigh, opHigh, instHigh);
+    // see ConvertToSubInst() in
+    // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+
+    // Add the duplex instruction MCInsts as operands to the passed in MCInst.
+    MCOperand OPLow = MCOperand::createInst(MILow);
+    MCOperand OPHigh = MCOperand::createInst(MIHigh);
+    MI.addOperand(OPLow);
+    MI.addOperand(OPHigh);
+    Complete = true;
+  } else {
+    if ((Instruction & HexagonII::INST_PARSE_MASK) ==
+        HexagonII::INST_PARSE_PACKET_END)
+      Complete = true;
+    // Calling the auto-generated decoder function.
+    Result =
+        decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
+  }
+
   return Result;
 }
+
+// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
+enum subInstBinaryValues {
+  V4_SA1_addi_BITS = 0x0000,
+  V4_SA1_addi_MASK = 0x1800,
+  V4_SA1_addrx_BITS = 0x1800,
+  V4_SA1_addrx_MASK = 0x1f00,
+  V4_SA1_addsp_BITS = 0x0c00,
+  V4_SA1_addsp_MASK = 0x1c00,
+  V4_SA1_and1_BITS = 0x1200,
+  V4_SA1_and1_MASK = 0x1f00,
+  V4_SA1_clrf_BITS = 0x1a70,
+  V4_SA1_clrf_MASK = 0x1e70,
+  V4_SA1_clrfnew_BITS = 0x1a50,
+  V4_SA1_clrfnew_MASK = 0x1e70,
+  V4_SA1_clrt_BITS = 0x1a60,
+  V4_SA1_clrt_MASK = 0x1e70,
+  V4_SA1_clrtnew_BITS = 0x1a40,
+  V4_SA1_clrtnew_MASK = 0x1e70,
+  V4_SA1_cmpeqi_BITS = 0x1900,
+  V4_SA1_cmpeqi_MASK = 0x1f00,
+  V4_SA1_combine0i_BITS = 0x1c00,
+  V4_SA1_combine0i_MASK = 0x1d18,
+  V4_SA1_combine1i_BITS = 0x1c08,
+  V4_SA1_combine1i_MASK = 0x1d18,
+  V4_SA1_combine2i_BITS = 0x1c10,
+  V4_SA1_combine2i_MASK = 0x1d18,
+  V4_SA1_combine3i_BITS = 0x1c18,
+  V4_SA1_combine3i_MASK = 0x1d18,
+  V4_SA1_combinerz_BITS = 0x1d08,
+  V4_SA1_combinerz_MASK = 0x1d08,
+  V4_SA1_combinezr_BITS = 0x1d00,
+  V4_SA1_combinezr_MASK = 0x1d08,
+  V4_SA1_dec_BITS = 0x1300,
+  V4_SA1_dec_MASK = 0x1f00,
+  V4_SA1_inc_BITS = 0x1100,
+  V4_SA1_inc_MASK = 0x1f00,
+  V4_SA1_seti_BITS = 0x0800,
+  V4_SA1_seti_MASK = 0x1c00,
+  V4_SA1_setin1_BITS = 0x1a00,
+  V4_SA1_setin1_MASK = 0x1e40,
+  V4_SA1_sxtb_BITS = 0x1500,
+  V4_SA1_sxtb_MASK = 0x1f00,
+  V4_SA1_sxth_BITS = 0x1400,
+  V4_SA1_sxth_MASK = 0x1f00,
+  V4_SA1_tfr_BITS = 0x1000,
+  V4_SA1_tfr_MASK = 0x1f00,
+  V4_SA1_zxtb_BITS = 0x1700,
+  V4_SA1_zxtb_MASK = 0x1f00,
+  V4_SA1_zxth_BITS = 0x1600,
+  V4_SA1_zxth_MASK = 0x1f00,
+  V4_SL1_loadri_io_BITS = 0x0000,
+  V4_SL1_loadri_io_MASK = 0x1000,
+  V4_SL1_loadrub_io_BITS = 0x1000,
+  V4_SL1_loadrub_io_MASK = 0x1000,
+  V4_SL2_deallocframe_BITS = 0x1f00,
+  V4_SL2_deallocframe_MASK = 0x1fc0,
+  V4_SL2_jumpr31_BITS = 0x1fc0,
+  V4_SL2_jumpr31_MASK = 0x1fc4,
+  V4_SL2_jumpr31_f_BITS = 0x1fc5,
+  V4_SL2_jumpr31_f_MASK = 0x1fc7,
+  V4_SL2_jumpr31_fnew_BITS = 0x1fc7,
+  V4_SL2_jumpr31_fnew_MASK = 0x1fc7,
+  V4_SL2_jumpr31_t_BITS = 0x1fc4,
+  V4_SL2_jumpr31_t_MASK = 0x1fc7,
+  V4_SL2_jumpr31_tnew_BITS = 0x1fc6,
+  V4_SL2_jumpr31_tnew_MASK = 0x1fc7,
+  V4_SL2_loadrb_io_BITS = 0x1000,
+  V4_SL2_loadrb_io_MASK = 0x1800,
+  V4_SL2_loadrd_sp_BITS = 0x1e00,
+  V4_SL2_loadrd_sp_MASK = 0x1f00,
+  V4_SL2_loadrh_io_BITS = 0x0000,
+  V4_SL2_loadrh_io_MASK = 0x1800,
+  V4_SL2_loadri_sp_BITS = 0x1c00,
+  V4_SL2_loadri_sp_MASK = 0x1e00,
+  V4_SL2_loadruh_io_BITS = 0x0800,
+  V4_SL2_loadruh_io_MASK = 0x1800,
+  V4_SL2_return_BITS = 0x1f40,
+  V4_SL2_return_MASK = 0x1fc4,
+  V4_SL2_return_f_BITS = 0x1f45,
+  V4_SL2_return_f_MASK = 0x1fc7,
+  V4_SL2_return_fnew_BITS = 0x1f47,
+  V4_SL2_return_fnew_MASK = 0x1fc7,
+  V4_SL2_return_t_BITS = 0x1f44,
+  V4_SL2_return_t_MASK = 0x1fc7,
+  V4_SL2_return_tnew_BITS = 0x1f46,
+  V4_SL2_return_tnew_MASK = 0x1fc7,
+  V4_SS1_storeb_io_BITS = 0x1000,
+  V4_SS1_storeb_io_MASK = 0x1000,
+  V4_SS1_storew_io_BITS = 0x0000,
+  V4_SS1_storew_io_MASK = 0x1000,
+  V4_SS2_allocframe_BITS = 0x1c00,
+  V4_SS2_allocframe_MASK = 0x1e00,
+  V4_SS2_storebi0_BITS = 0x1200,
+  V4_SS2_storebi0_MASK = 0x1f00,
+  V4_SS2_storebi1_BITS = 0x1300,
+  V4_SS2_storebi1_MASK = 0x1f00,
+  V4_SS2_stored_sp_BITS = 0x0a00,
+  V4_SS2_stored_sp_MASK = 0x1e00,
+  V4_SS2_storeh_io_BITS = 0x0000,
+  V4_SS2_storeh_io_MASK = 0x1800,
+  V4_SS2_storew_sp_BITS = 0x0800,
+  V4_SS2_storew_sp_MASK = 0x1e00,
+  V4_SS2_storewi0_BITS = 0x1000,
+  V4_SS2_storewi0_MASK = 0x1f00,
+  V4_SS2_storewi1_BITS = 0x1100,
+  V4_SS2_storewi1_MASK = 0x1f00
+};
+
+static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
+                                 raw_ostream &os) {
+  switch (IClass) { // each class is matched against its own BITS/MASK pairs
+  case HexagonII::HSIG_L1: // both masks are 0x1000, so bit 12 picks the opcode
+    if ((inst & V4_SL1_loadri_io_MASK) == V4_SL1_loadri_io_BITS)
+      op = Hexagon::V4_SL1_loadri_io;
+    else if ((inst & V4_SL1_loadrub_io_MASK) == V4_SL1_loadrub_io_BITS)
+      op = Hexagon::V4_SL1_loadrub_io;
+    else { // not reachable: the two tests above cover both values of bit 12
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_L2:
+    if ((inst & V4_SL2_deallocframe_MASK) == V4_SL2_deallocframe_BITS)
+      op = Hexagon::V4_SL2_deallocframe;
+    else if ((inst & V4_SL2_jumpr31_MASK) == V4_SL2_jumpr31_BITS)
+      op = Hexagon::V4_SL2_jumpr31;
+    else if ((inst & V4_SL2_jumpr31_f_MASK) == V4_SL2_jumpr31_f_BITS)
+      op = Hexagon::V4_SL2_jumpr31_f;
+    else if ((inst & V4_SL2_jumpr31_fnew_MASK) == V4_SL2_jumpr31_fnew_BITS)
+      op = Hexagon::V4_SL2_jumpr31_fnew;
+    else if ((inst & V4_SL2_jumpr31_t_MASK) == V4_SL2_jumpr31_t_BITS)
+      op = Hexagon::V4_SL2_jumpr31_t;
+    else if ((inst & V4_SL2_jumpr31_tnew_MASK) == V4_SL2_jumpr31_tnew_BITS)
+      op = Hexagon::V4_SL2_jumpr31_tnew;
+    else if ((inst & V4_SL2_loadrb_io_MASK) == V4_SL2_loadrb_io_BITS)
+      op = Hexagon::V4_SL2_loadrb_io;
+    else if ((inst & V4_SL2_loadrd_sp_MASK) == V4_SL2_loadrd_sp_BITS)
+      op = Hexagon::V4_SL2_loadrd_sp;
+    else if ((inst & V4_SL2_loadrh_io_MASK) == V4_SL2_loadrh_io_BITS)
+      op = Hexagon::V4_SL2_loadrh_io;
+    else if ((inst & V4_SL2_loadri_sp_MASK) == V4_SL2_loadri_sp_BITS)
+      op = Hexagon::V4_SL2_loadri_sp;
+    else if ((inst & V4_SL2_loadruh_io_MASK) == V4_SL2_loadruh_io_BITS)
+      op = Hexagon::V4_SL2_loadruh_io;
+    else if ((inst & V4_SL2_return_MASK) == V4_SL2_return_BITS)
+      op = Hexagon::V4_SL2_return;
+    else if ((inst & V4_SL2_return_f_MASK) == V4_SL2_return_f_BITS)
+      op = Hexagon::V4_SL2_return_f;
+    else if ((inst & V4_SL2_return_fnew_MASK) == V4_SL2_return_fnew_BITS)
+      op = Hexagon::V4_SL2_return_fnew;
+    else if ((inst & V4_SL2_return_t_MASK) == V4_SL2_return_t_BITS)
+      op = Hexagon::V4_SL2_return_t;
+    else if ((inst & V4_SL2_return_tnew_MASK) == V4_SL2_return_tnew_BITS)
+      op = Hexagon::V4_SL2_return_tnew;
+    else { // no HSIG_L2 pattern matched
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_A:
+    if ((inst & V4_SA1_addi_MASK) == V4_SA1_addi_BITS)
+      op = Hexagon::V4_SA1_addi;
+    else if ((inst & V4_SA1_addrx_MASK) == V4_SA1_addrx_BITS)
+      op = Hexagon::V4_SA1_addrx;
+    else if ((inst & V4_SA1_addsp_MASK) == V4_SA1_addsp_BITS)
+      op = Hexagon::V4_SA1_addsp;
+    else if ((inst & V4_SA1_and1_MASK) == V4_SA1_and1_BITS)
+      op = Hexagon::V4_SA1_and1;
+    else if ((inst & V4_SA1_clrf_MASK) == V4_SA1_clrf_BITS)
+      op = Hexagon::V4_SA1_clrf;
+    else if ((inst & V4_SA1_clrfnew_MASK) == V4_SA1_clrfnew_BITS)
+      op = Hexagon::V4_SA1_clrfnew;
+    else if ((inst & V4_SA1_clrt_MASK) == V4_SA1_clrt_BITS)
+      op = Hexagon::V4_SA1_clrt;
+    else if ((inst & V4_SA1_clrtnew_MASK) == V4_SA1_clrtnew_BITS)
+      op = Hexagon::V4_SA1_clrtnew;
+    else if ((inst & V4_SA1_cmpeqi_MASK) == V4_SA1_cmpeqi_BITS)
+      op = Hexagon::V4_SA1_cmpeqi;
+    else if ((inst & V4_SA1_combine0i_MASK) == V4_SA1_combine0i_BITS)
+      op = Hexagon::V4_SA1_combine0i;
+    else if ((inst & V4_SA1_combine1i_MASK) == V4_SA1_combine1i_BITS)
+      op = Hexagon::V4_SA1_combine1i;
+    else if ((inst & V4_SA1_combine2i_MASK) == V4_SA1_combine2i_BITS)
+      op = Hexagon::V4_SA1_combine2i;
+    else if ((inst & V4_SA1_combine3i_MASK) == V4_SA1_combine3i_BITS)
+      op = Hexagon::V4_SA1_combine3i;
+    else if ((inst & V4_SA1_combinerz_MASK) == V4_SA1_combinerz_BITS)
+      op = Hexagon::V4_SA1_combinerz;
+    else if ((inst & V4_SA1_combinezr_MASK) == V4_SA1_combinezr_BITS)
+      op = Hexagon::V4_SA1_combinezr;
+    else if ((inst & V4_SA1_dec_MASK) == V4_SA1_dec_BITS)
+      op = Hexagon::V4_SA1_dec;
+    else if ((inst & V4_SA1_inc_MASK) == V4_SA1_inc_BITS)
+      op = Hexagon::V4_SA1_inc;
+    else if ((inst & V4_SA1_seti_MASK) == V4_SA1_seti_BITS)
+      op = Hexagon::V4_SA1_seti;
+    else if ((inst & V4_SA1_setin1_MASK) == V4_SA1_setin1_BITS)
+      op = Hexagon::V4_SA1_setin1;
+    else if ((inst & V4_SA1_sxtb_MASK) == V4_SA1_sxtb_BITS)
+      op = Hexagon::V4_SA1_sxtb;
+    else if ((inst & V4_SA1_sxth_MASK) == V4_SA1_sxth_BITS)
+      op = Hexagon::V4_SA1_sxth;
+    else if ((inst & V4_SA1_tfr_MASK) == V4_SA1_tfr_BITS)
+      op = Hexagon::V4_SA1_tfr;
+    else if ((inst & V4_SA1_zxtb_MASK) == V4_SA1_zxtb_BITS)
+      op = Hexagon::V4_SA1_zxtb;
+    else if ((inst & V4_SA1_zxth_MASK) == V4_SA1_zxth_BITS)
+      op = Hexagon::V4_SA1_zxth;
+    else { // no HSIG_A pattern matched
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_S1: // both masks are 0x1000, so bit 12 picks the opcode
+    if ((inst & V4_SS1_storeb_io_MASK) == V4_SS1_storeb_io_BITS)
+      op = Hexagon::V4_SS1_storeb_io;
+    else if ((inst & V4_SS1_storew_io_MASK) == V4_SS1_storew_io_BITS)
+      op = Hexagon::V4_SS1_storew_io;
+    else { // not reachable: the two tests above cover both values of bit 12
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  case HexagonII::HSIG_S2:
+    if ((inst & V4_SS2_allocframe_MASK) == V4_SS2_allocframe_BITS)
+      op = Hexagon::V4_SS2_allocframe;
+    else if ((inst & V4_SS2_storebi0_MASK) == V4_SS2_storebi0_BITS)
+      op = Hexagon::V4_SS2_storebi0;
+    else if ((inst & V4_SS2_storebi1_MASK) == V4_SS2_storebi1_BITS)
+      op = Hexagon::V4_SS2_storebi1;
+    else if ((inst & V4_SS2_stored_sp_MASK) == V4_SS2_stored_sp_BITS)
+      op = Hexagon::V4_SS2_stored_sp;
+    else if ((inst & V4_SS2_storeh_io_MASK) == V4_SS2_storeh_io_BITS)
+      op = Hexagon::V4_SS2_storeh_io;
+    else if ((inst & V4_SS2_storew_sp_MASK) == V4_SS2_storew_sp_BITS)
+      op = Hexagon::V4_SS2_storew_sp;
+    else if ((inst & V4_SS2_storewi0_MASK) == V4_SS2_storewi0_BITS)
+      op = Hexagon::V4_SS2_storewi0;
+    else if ((inst & V4_SS2_storewi1_MASK) == V4_SS2_storewi1_BITS)
+      op = Hexagon::V4_SS2_storewi1;
+    else { // no HSIG_S2 pattern matched
+      os << "<unknown subinstruction>";
+      return MCDisassembler::Fail;
+    }
+    break;
+  default: // IClass is none of the five classes handled above
+    os << "<unknown>";
+    return MCDisassembler::Fail;
+  }
+  return MCDisassembler::Success; // op now holds the matched opcode
+}
+
+static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
+  // Encodings 0-7 select R0 + 0..7; encodings 8-15 select R0 + 16..23.
+  if (encoded_reg >= 16)
+    return Hexagon::NoRegister;
+  const unsigned Gap = encoded_reg < 8 ? 0 : 8;
+  return Hexagon::R0 + encoded_reg + Gap;
+}
+
+static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
+  // Encodings 0-3 select D0 + 0..3; encodings 4-7 select D0 + 8..11.
+  if (encoded_dreg >= 8)
+    return Hexagon::NoRegister;
+  const unsigned Gap = encoded_dreg < 4 ? 0 : 4;
+  return Hexagon::D0 + encoded_dreg + Gap;
+}
+
+// Append to MI the operands of duplex sub-instruction `opcode`, decoded from
+// its encoding `inst`. Case comments give each field's bit range; "{W_S}"
+// marks a W-bit immediate that is scaled by shifting it left S bits.
+static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) {
+  int64_t operand;
+  MCOperand Op;
+  switch (opcode) {
+  case Hexagon::V4_SL2_deallocframe:
+  case Hexagon::V4_SL2_jumpr31:
+  case Hexagon::V4_SL2_jumpr31_f:
+  case Hexagon::V4_SL2_jumpr31_fnew:
+  case Hexagon::V4_SL2_jumpr31_t:
+  case Hexagon::V4_SL2_jumpr31_tnew:
+  case Hexagon::V4_SL2_return:
+  case Hexagon::V4_SL2_return_f:
+  case Hexagon::V4_SL2_return_fnew:
+  case Hexagon::V4_SL2_return_t:
+  case Hexagon::V4_SL2_return_tnew:
+    // no operands for these instructions
+    break;
+  case Hexagon::V4_SS2_allocframe:
+    // u 8-4{5_3}
+    operand = ((inst & 0x1f0) >> 4) << 3;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SL1_loadri_io:
+    // Rd 3-0, Rs 7-4, u 11-8{4_2}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf00) >> 6;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SL1_loadrub_io:
+    // Rd 3-0, Rs 7-4, u 11-8
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf00) >> 8;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SL2_loadrb_io:
+    // Rd 3-0, Rs 7-4, u 10-8
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0x700) >> 8;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SL2_loadrh_io:
+  case Hexagon::V4_SL2_loadruh_io:
+    // Rd 3-0, Rs 7-4, u 10-8{3_1}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x700) >> 8) << 1;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SL2_loadrd_sp:
+    // Rdd 2-0, u 7-3{5_3}
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x0f8) >> 3) << 3;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SL2_loadri_sp:
+    // Rd 3-0, u 8-4{5_2}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x1f0) >> 4) << 2;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_addi:
+    // Rx 3-0 (x2), s7 10-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    MI->addOperand(Op);
+    operand = SignExtend64<7>((inst & 0x7f0) >> 4);
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_addrx:
+    // Rx 3-0 (x2), Rs 7-4
+    Op = MCOperand::createReg(getRegFromSubinstEncoding(inst & 0xf));
+    MI->addOperand(Op);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    MI->addOperand(MCOperand::createReg(operand));
+    break; // must not fall into the Rd/Rs group and add two more operands
+  case Hexagon::V4_SA1_and1:
+  case Hexagon::V4_SA1_dec:
+  case Hexagon::V4_SA1_inc:
+  case Hexagon::V4_SA1_sxtb:
+  case Hexagon::V4_SA1_sxth:
+  case Hexagon::V4_SA1_tfr:
+  case Hexagon::V4_SA1_zxtb:
+  case Hexagon::V4_SA1_zxth:
+    // Rd 3-0, Rs 7-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_addsp:
+    // Rd 3-0, u 9-4{6_2}
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x3f0) >> 4) << 2;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_seti:
+    // Rd 3-0, u 9-4
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0x3f0) >> 4;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_clrf:
+  case Hexagon::V4_SA1_clrfnew:
+  case Hexagon::V4_SA1_clrt:
+  case Hexagon::V4_SA1_clrtnew:
+  case Hexagon::V4_SA1_setin1:
+    // Rd 3-0
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_cmpeqi:
+    // Rs 7-4, u 1-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = inst & 0x3;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_combine0i:
+  case Hexagon::V4_SA1_combine1i:
+  case Hexagon::V4_SA1_combine2i:
+  case Hexagon::V4_SA1_combine3i:
+    // Rdd 2-0, u 6-5
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0x060) >> 5;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SA1_combinerz:
+  case Hexagon::V4_SA1_combinezr:
+    // Rdd 2-0, Rs 7-4
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SS1_storeb_io:
+    // Rs 7-4, u 11-8, Rt 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf00) >> 8;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SS1_storew_io:
+    // Rs 7-4, u 11-8{4_2}, Rt 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0xf00) >> 8) << 2;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SS2_storebi0:
+  case Hexagon::V4_SS2_storebi1:
+    // Rs 7-4, u 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = inst & 0xf;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SS2_storewi0:
+  case Hexagon::V4_SS2_storewi1:
+    // Rs 7-4, u 3-0{4_2}
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = (inst & 0xf) << 2;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SS2_stored_sp:
+    // s 8-3{6_3}, Rtt 2-0
+    operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3);
+    MI->addOperand(MCOperand::createImm(operand));
+    operand = getDRegFromSubinstEncoding(inst & 0x7);
+    MI->addOperand(MCOperand::createReg(operand));
+    break; // must not fall into storeh_io and add three more operands
+  case Hexagon::V4_SS2_storeh_io:
+    // Rs 7-4, u 10-8{3_1}, Rt 3-0
+    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    operand = ((inst & 0x700) >> 8) << 1;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  case Hexagon::V4_SS2_storew_sp:
+    // u 8-4{5_2}, Rd 3-0
+    operand = ((inst & 0x1f0) >> 4) << 2;
+    Op = MCOperand::createImm(operand);
+    MI->addOperand(Op);
+    operand = getRegFromSubinstEncoding(inst & 0xf);
+    Op = MCOperand::createReg(operand);
+    MI->addOperand(Op);
+    break;
+  default:
+    // Unknown opcode: deliberately add no operands instead of llvm_unreachable.
+    break;
+  }
+}
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index dfe79f9ff7b0..6e2ecaf57e49 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -76,4 +76,11 @@ namespace llvm {
 // Maximum number of words and instructions in a packet.
 #define HEXAGON_PACKET_SIZE 4
 
+// Minimum number of instructions in an end-loop packet.
+#define HEXAGON_PACKET_INNER_SIZE 2
+#define HEXAGON_PACKET_OUTER_SIZE 3
+// Maximum number of instructions in a packet before shuffling,
+// including a compound one or a duplex or an extender.
+#define HEXAGON_PRESHUFFLE_PACKET_SIZE (HEXAGON_PACKET_SIZE + 3)
+
 #endif
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index e9491baf29ef..05728d2b627e 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "HexagonTargetMachine.h"
 #include "MCTargetDesc/HexagonInstPrinter.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -78,14 +79,14 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
     O << MO.getImm();
     return;
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     return;
   case MachineOperand::MO_ConstantPoolIndex:
-    O << *GetCPISymbol(MO.getIndex());
+    GetCPISymbol(MO.getIndex())->print(O, MAI);
     return;
   case MachineOperand::MO_GlobalAddress:
     // Computing the address of a global symbol, not calling it.
-    O << *getSymbol(MO.getGlobal());
+    getSymbol(MO.getGlobal())->print(O, MAI);
     printOffset(MO.getOffset(), O);
     return;
   }
@@ -177,49 +178,40 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 /// the current output stream.
 ///
 void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  if (MI->isBundle()) {
-    std::vector<MachineInstr const *> BundleMIs;
+  // Everything, bundled or not, goes out as a single BUNDLE MCInst: operand 0
+  // is an immediate and HexagonLowerToMC appends the lowered instructions.
+  MCInst MCB;
+  MCB.setOpcode(Hexagon::BUNDLE);
+  MCB.addOperand(MCOperand::createImm(0));
 
-    const MachineBasicBlock *MBB = MI->getParent();
+  if (MI->isBundle()) {
+    const MachineBasicBlock* MBB = MI->getParent();
     MachineBasicBlock::const_instr_iterator MII = MI;
-    ++MII;
-    unsigned int IgnoreCount = 0;
-    while (MII != MBB->end() && MII->isInsideBundle()) {
-      const MachineInstr *MInst = MII;
-      if (MInst->getOpcode() == TargetOpcode::DBG_VALUE ||
-        MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
-        IgnoreCount++;
-        ++MII;
-        continue;
+
+    for (++MII; MII != MBB->end() && MII->isInsideBundle(); ++MII) {
+      // DBG_VALUE and IMPLICIT_DEF are skipped: they are not lowered.
+      if (MII->getOpcode() != TargetOpcode::DBG_VALUE &&
+          MII->getOpcode() != TargetOpcode::IMPLICIT_DEF) {
+        HexagonLowerToMC(MII, MCB, *this);
       }
-      // BundleMIs.push_back(&*MII);
-      BundleMIs.push_back(MInst);
-      ++MII;
-    }
-    unsigned Size = BundleMIs.size();
-    assert((Size + IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!");
-    for (unsigned Index = 0; Index < Size; Index++) {
-      MCInst MCI;
-
-      HexagonLowerToMC(BundleMIs[Index], MCI, *this);
-      HexagonMCInstrInfo::AppendImplicitOperands(MCI);
-      HexagonMCInstrInfo::setPacketBegin(MCI, Index == 0);
-      HexagonMCInstrInfo::setPacketEnd(MCI, Index == (Size - 1));
-      EmitToStreamer(*OutStreamer, MCI);
     }
   }
   else {
-    MCInst MCI;
-    HexagonLowerToMC(MI, MCI, *this);
-    HexagonMCInstrInfo::AppendImplicitOperands(MCI);
-    if (MI->getOpcode() == Hexagon::ENDLOOP0) {
-      HexagonMCInstrInfo::setPacketBegin(MCI, true);
-      HexagonMCInstrInfo::setPacketEnd(MCI, true);
-    }
-    EmitToStreamer(*OutStreamer, MCI);
+    HexagonLowerToMC(MI, MCB, *this);
+    HexagonMCInstrInfo::padEndloop(MCB);
   }
-
-  return;
+  // Examine the packet and try to find instructions that can be converted
+  // to compounds.
+  HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(),
+                                  OutStreamer->getContext(), MCB);
+  // Examine the packet and convert pairs of instructions to duplex
+  // instructions when possible.
+  SmallVector<DuplexCandidate, 8> possibleDuplexes;
+  possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(
+      *Subtarget->getInstrInfo(), MCB);
+  HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget,
+                   OutStreamer->getContext(), MCB, possibleDuplexes);
+  EmitToStreamer(*OutStreamer, MCB);
 }
 
 extern "C" void LLVMInitializeHexagonAsmPrinter() {
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 0885a794a7b4..868f87e18413 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -201,17 +201,17 @@ namespace {
           break;
       }
       // Check individual operands.
-      for (ConstMIOperands Mo(MI); Mo.isValid(); ++Mo) {
+      for (const MachineOperand &MO : MI->operands()) {
         // While the presence of a frame index does not prove that a stack
         // frame will be required, all frame indexes should be within alloc-
         // frame/deallocframe. Otherwise, the code that translates a frame
         // index into an offset would have to be aware of the placement of
         // the frame creation/destruction instructions.
-        if (Mo->isFI())
+        if (MO.isFI())
           return true;
-        if (!Mo->isReg())
+        if (!MO.isReg())
           continue;
-        unsigned R = Mo->getReg();
+        unsigned R = MO.getReg();
         // Virtual registers will need scavenging, which then may require
         // a stack slot.
         if (TargetRegisterInfo::isVirtualRegister(R))
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index ed5676c1fbb6..74d92aef25ac 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2370,7 +2370,8 @@ bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
 /// isLegalAddressingMode - Return true if the addressing mode represented by
 /// AM is legal for this target, for a load/store of the specified type.
 bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                  Type *Ty) const {
+                                                  Type *Ty,
+                                                  unsigned AS) const {
   // Allows a signed-extended 11-bit immediate field.
   if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1)
     return false;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 584c2c57c7ca..b80e8477eb7b 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -198,7 +198,8 @@ bool isPositiveHalfWord(SDNode *N);
     /// The type may be VoidTy, in which case only return true if the addressing
     /// mode is legal for a load/store of any legal type.
     /// TODO: Handle pre/postinc as well.
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                               unsigned AS) const override;
     bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
     /// isLegalICmpImmediate - Return true if the specified immediate is legal
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 36a7e9f642c6..44bab292f32c 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -66,10 +66,8 @@ def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
 class OpcodeHexagon {
   field bits<32> Inst = ?; // Default to an invalid insn.
   bits<4> IClass = 0; // ICLASS
-  bits<2> IParse = 0; // Parse bits.
 
   let Inst{31-28} = IClass;
-  let Inst{15-14} = IParse;
 
   bits<1> zero = 0;
 }
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 7f7b2c96dba7..db83ef6bc474 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -146,6 +146,13 @@ class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
   : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
                 TypePREFIX>, OpcodeHexagon;
 
+// Format for duplex sub-instructions. cstr is forwarded to InstHexagon so that
+// operand constraints given by a def (e.g. "$_src_ = $Rx") take effect.
+class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PREFIX, TypeDUPLEX>,
+    OpcodeHexagon;
+
 class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
               string cstr = "">
   : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 49b4517698d5..e566a97789a9 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -779,10 +779,9 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   return false;
 }
 
-MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                      MachineInstr *MI,
-                                                      ArrayRef<unsigned> Ops,
-                                                      int FI) const {
+MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FI) const {
   // Hexagon_TODO: Implement.
   return nullptr;
 }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 0239cabe9e52..a7ae65e4eb9c 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -114,10 +114,12 @@ public:
 
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       int FrameIndex) const override;
 
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       MachineInstr *LoadMI) const override {
     return nullptr;
   }
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 8b667c645156..65b0f4974367 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -4263,3 +4263,7 @@ def J4_jumpsetr: CJInst <
     let Inst{19-16} = Rs;
     let Inst{7-1} = r9_2{8-2};
   }
+
+// Duplex instructions
+//===----------------------------------------------------------------------===//
+include "HexagonIsetDx.td"
diff --git a/lib/Target/Hexagon/HexagonIsetDx.td b/lib/Target/Hexagon/HexagonIsetDx.td
new file mode 100644
index 000000000000..0ca95e999859
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIsetDx.td
@@ -0,0 +1,728 @@
+//=- HexagonIsetDx.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon duplex instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// SA1_combine1i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_combine1i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2Imm:$u2),
+  "$Rdd = combine(#1, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b01;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SL2_jumpr31_f: Indirect conditional jump if false.
+// SL2_jumpr31_f -> SL2_jumpr31_fnew
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def V4_SL2_jumpr31_f: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0) jumpr r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b101;
+  }
+
+// SL2_deallocframe: Deallocate stack frame.
+let Defs = [R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
+def V4_SL2_deallocframe: SUBInst <
+  (outs ),
+  (ins ),
+  "deallocframe"> {
+    let Inst{12-6} = 0b1111100;
+    let Inst{2} = 0b0;
+  }
+
+// SL2_return_f: Deallocate stack frame and return.
+// SL2_return_f -> SL2_return_fnew
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def V4_SL2_return_f: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0) dealloc_return"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b101;
+  }
+
+// SA1_combine3i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_combine3i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2Imm:$u2),
+  "$Rdd = combine(#3, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b11;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SS2_storebi0: Store byte.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def V4_SS2_storebi0: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+  "memb($Rs + #$u4_0)=#0"> {
+    bits<4> Rs;
+    bits<4> u4_0;
+
+    let Inst{12-8} = 0b10010;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_0;
+  }
+
+// SA1_clrtnew: Clear if true.
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_clrtnew: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins ),
+  "if (p0.new) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b100;
+    let Inst{3-0} = Rd;
+  }
+
+// SL2_loadruh_io: Load half.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
+def V4_SL2_loadruh_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u3_1Imm:$u3_1),
+  "$Rd = memuh($Rs + #$u3_1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<4> u3_1;
+
+    let Inst{12-11} = 0b01;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_1{3-1};
+  }
+
+// SL2_jumpr31_tnew: Indirect conditional jump if true.
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def V4_SL2_jumpr31_tnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0.new) jumpr:nt r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b110;
+  }
+
+// SA1_addi: Add.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 1, opExtentBits = 7, opExtendable = 2 in
+def V4_SA1_addi: SUBInst <
+  (outs IntRegs:$Rx),
+  (ins IntRegs:$_src_, s7Ext:$s7),
+  "$Rx = add($_src_, #$s7)" ,
+  [] ,
+  "$_src_ = $Rx"> {
+    bits<4> Rx;
+    bits<7> s7;
+
+    let Inst{12-11} = 0b00;
+    let Inst{3-0} = Rx;
+    let Inst{10-4} = s7;
+  }
+
+// SL1_loadrub_io: Load byte.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
+def V4_SL1_loadrub_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+  "$Rd = memub($Rs + #$u4_0)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<4> u4_0;
+
+    let Inst{12} = 0b1;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_0;
+  }
+
+// SL1_loadri_io: Load word.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
+def V4_SL1_loadri_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+  "$Rd = memw($Rs + #$u4_2)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<6> u4_2;
+
+    let Inst{12} = 0b0;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_2{5-2};
+  }
+
+// SA1_cmpeqi: Compare immediate.
+let Defs = [P0], isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_cmpeqi: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u2Imm:$u2),
+  "p0 = cmp.eq($Rs, #$u2)"> {
+    bits<4> Rs;
+    bits<2> u2;
+
+    let Inst{12-8} = 0b11001;
+    let Inst{7-4} = Rs;
+    let Inst{1-0} = u2;
+  }
+
+// SA1_combinerz: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_combinerz: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins IntRegs:$Rs),
+  "$Rdd = combine($Rs, #0)"> {
+    bits<3> Rdd;
+    bits<4> Rs;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b1;
+    let Inst{3} = 0b1;
+    let Inst{2-0} = Rdd;
+    let Inst{7-4} = Rs;
+  }
+
+// SL2_return_t: Deallocate stack frame and return.
+// SL2_return_t -> SL2_return_tnew
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def V4_SL2_return_t: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0) dealloc_return"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b100;
+  }
+
+// SS2_allocframe: Allocate stack frame.
+let Defs = [R29, R30], Uses = [R30, R31, R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
+def V4_SS2_allocframe: SUBInst <
+  (outs ),
+  (ins u5_3Imm:$u5_3),
+  "allocframe(#$u5_3)"> {
+    bits<8> u5_3;
+
+    let Inst{12-9} = 0b1110;
+    let Inst{8-4} = u5_3{7-3};
+  }
+
+// SS2_storeh_io: Store half.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = HalfWordAccess in
+def V4_SS2_storeh_io: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u3_1Imm:$u3_1, IntRegs:$Rt),
+  "memh($Rs + #$u3_1) = $Rt"> {
+    bits<4> Rs;
+    bits<4> u3_1;
+    bits<4> Rt;
+
+    let Inst{12-11} = 0b00;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_1{3-1};
+    let Inst{3-0} = Rt;
+  }
+
+// SS2_storewi0: Store word.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def V4_SS2_storewi0: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+  "memw($Rs + #$u4_2)=#0"> {
+    bits<4> Rs;
+    bits<6> u4_2;
+
+    let Inst{12-8} = 0b10000;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_2{5-2};
+  }
+
+// SS2_storewi1: Store word.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def V4_SS2_storewi1: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
+  "memw($Rs + #$u4_2)=#1"> {
+    bits<4> Rs;
+    bits<6> u4_2;
+
+    let Inst{12-8} = 0b10001;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_2{5-2};
+  }
+
+// SL2_jumpr31: Indirect jump.
+let Defs = [PC], Uses = [R31], isCodeGenOnly = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def V4_SL2_jumpr31: SUBInst <
+  (outs ),
+  (ins ),
+  "jumpr r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2} = 0b0;
+  }
+
+// SA1_combinezr: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_combinezr: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins IntRegs:$Rs),
+  "$Rdd = combine(#0, $Rs)"> {
+    bits<3> Rdd;
+    bits<4> Rs;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b1;
+    let Inst{3} = 0b0;
+    let Inst{2-0} = Rdd;
+    let Inst{7-4} = Rs;
+  }
+
+// SL2_loadrh_io: Load half.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
+def V4_SL2_loadrh_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u3_1Imm:$u3_1),
+  "$Rd = memh($Rs + #$u3_1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<4> u3_1;
+
+    let Inst{12-11} = 0b00;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_1{3-1};
+  }
+
+// SA1_addrx: Add.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_addrx: SUBInst <
+  (outs IntRegs:$Rx),
+  (ins IntRegs:$_src_, IntRegs:$Rs),
+  "$Rx = add($_src_, $Rs)" ,
+  [] ,
+  "$_src_ = $Rx"> {
+    bits<4> Rx;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b11000;
+    let Inst{3-0} = Rx;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_setin1: Set to -1.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_setin1: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins ),
+  "$Rd = #-1"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6} = 0b0;
+    let Inst{3-0} = Rd;
+  }
+
+// SA1_sxth: Sign-extend halfword.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_sxth: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = sxth($Rs)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10100;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_combine0i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_combine0i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2Imm:$u2),
+  "$Rdd = combine(#0, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b00;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SA1_combine2i: Combines.
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+def V4_SA1_combine2i: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u2Imm:$u2),
+  "$Rdd = combine(#2, #$u2)"> {
+    bits<3> Rdd;
+    bits<2> u2;
+
+    let Inst{12-10} = 0b111;
+    let Inst{8} = 0b0;
+    let Inst{4-3} = 0b10;
+    let Inst{2-0} = Rdd;
+    let Inst{6-5} = u2;
+  }
+
+// SA1_sxtb: Sign-extend byte.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_sxtb: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = sxtb($Rs)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10101;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_clrf: Clear if false.
+// SA1_clrf -> SA1_clrfnew
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_clrf: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins ),
+  "if (!p0) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b111;
+    let Inst{3-0} = Rd;
+  }
+
+// SL2_loadrb_io: Load byte.
+let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
+def V4_SL2_loadrb_io: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs, u3_0Imm:$u3_0),
+  "$Rd = memb($Rs + #$u3_0)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+    bits<3> u3_0;
+
+    let Inst{12-11} = 0b10;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+    let Inst{10-8} = u3_0;
+  }
+
+// SA1_tfr: Register-to-register transfer.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_tfr: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = $Rs"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10000;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SL2_loadrd_sp: Load dword.
+let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
+def V4_SL2_loadrd_sp: SUBInst <
+  (outs DoubleRegs:$Rdd),
+  (ins u5_3Imm:$u5_3),
+  "$Rdd = memd(r29 + #$u5_3)"> {
+    bits<3> Rdd;
+    bits<8> u5_3;
+
+    let Inst{12-8} = 0b11110;
+    let Inst{2-0} = Rdd;
+    let Inst{7-3} = u5_3{7-3};
+  }
+
+// SA1_and1: And #1.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_and1: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = and($Rs, #1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10010;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SS2_storebi1: Store byte.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def V4_SS2_storebi1: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
+  "memb($Rs + #$u4_0)=#1"> {
+    bits<4> Rs;
+    bits<4> u4_0;
+
+    let Inst{12-8} = 0b10011;
+    let Inst{7-4} = Rs;
+    let Inst{3-0} = u4_0;
+  }
+
+// SA1_inc: Increment (add #1).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_inc: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = add($Rs, #1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10001;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SS2_stored_sp: Store dword.
+let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
+def V4_SS2_stored_sp: SUBInst <
+  (outs ),
+  (ins s6_3Imm:$s6_3, DoubleRegs:$Rtt),
+  "memd(r29 + #$s6_3) = $Rtt"> {
+    bits<9> s6_3;
+    bits<3> Rtt;
+
+    let Inst{12-9} = 0b0101;
+    let Inst{8-3} = s6_3{8-3};
+    let Inst{2-0} = Rtt;
+  }
+
+// SS2_storew_sp: Store word.
+let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def V4_SS2_storew_sp: SUBInst <
+  (outs ),
+  (ins u5_2Imm:$u5_2, IntRegs:$Rt),
+  "memw(r29 + #$u5_2) = $Rt"> {
+    bits<7> u5_2;
+    bits<4> Rt;
+
+    let Inst{12-9} = 0b0100;
+    let Inst{8-4} = u5_2{6-2};
+    let Inst{3-0} = Rt;
+  }
+
+// SL2_jumpr31_fnew: Indirect conditional jump if false.
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def V4_SL2_jumpr31_fnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0.new) jumpr:nt r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b111;
+  }
+
+// SA1_clrt: Clear if true.
+// SA1_clrt -> SA1_clrtnew
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_clrt: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins ),
+  "if (p0) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b110;
+    let Inst{3-0} = Rd;
+  }
+
+// SL2_return: Deallocate stack frame and return.
+let Defs = [PC, R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def V4_SL2_return: SUBInst <
+  (outs ),
+  (ins ),
+  "dealloc_return"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2} = 0b0;
+  }
+
+// SA1_dec: Decrement (add #-1).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_dec: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = add($Rs,#-1)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10011;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_seti: Set immediate.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 0, opExtentBits = 6, opExtendable = 1 in
+def V4_SA1_seti: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins u6Ext:$u6),
+  "$Rd = #$u6"> {
+    bits<4> Rd;
+    bits<6> u6;
+
+    let Inst{12-10} = 0b010;
+    let Inst{3-0} = Rd;
+    let Inst{9-4} = u6;
+  }
+
+// SL2_jumpr31_t: Indirect conditional jump if true.
+// SL2_jumpr31_t -> SL2_jumpr31_tnew
+let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+def V4_SL2_jumpr31_t: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0) jumpr r31"> {
+    let Inst{12-6} = 0b1111111;
+    let Inst{2-0} = 0b100;
+  }
+
+// SA1_clrfnew: Clear if false.
+let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_clrfnew: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins ),
+  "if (!p0.new) $Rd = #0"> {
+    bits<4> Rd;
+
+    let Inst{12-9} = 0b1101;
+    let Inst{6-4} = 0b101;
+    let Inst{3-0} = Rd;
+  }
+
+// SS1_storew_io: Store word.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
+def V4_SS1_storew_io: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_2Imm:$u4_2, IntRegs:$Rt),
+  "memw($Rs + #$u4_2) = $Rt"> {
+    bits<4> Rs;
+    bits<6> u4_2;
+    bits<4> Rt;
+
+    let Inst{12} = 0b0;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_2{5-2};
+    let Inst{3-0} = Rt;
+  }
+
+// SA1_zxtb: Zero-extend byte (and with #255).
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_zxtb: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = and($Rs, #255)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10111;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
+// SA1_addsp: Add.
+let Uses = [R29], isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_addsp: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins u6_2Imm:$u6_2),
+  "$Rd = add(r29, #$u6_2)"> {
+    bits<4> Rd;
+    bits<8> u6_2;
+
+    let Inst{12-10} = 0b011;
+    let Inst{3-0} = Rd;
+    let Inst{9-4} = u6_2{7-2};
+  }
+
+// SL2_loadri_sp: Load word.
+let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
+def V4_SL2_loadri_sp: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins u5_2Imm:$u5_2),
+  "$Rd = memw(r29 + #$u5_2)"> {
+    bits<4> Rd;
+    bits<7> u5_2;
+
+    let Inst{12-9} = 0b1110;
+    let Inst{3-0} = Rd;
+    let Inst{8-4} = u5_2{6-2};
+  }
+
+// SS1_storeb_io: Store byte.
+let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
+def V4_SS1_storeb_io: SUBInst <
+  (outs ),
+  (ins IntRegs:$Rs, u4_0Imm:$u4_0, IntRegs:$Rt),
+  "memb($Rs + #$u4_0) = $Rt"> {
+    bits<4> Rs;
+    bits<4> u4_0;
+    bits<4> Rt;
+
+    let Inst{12} = 0b1;
+    let Inst{7-4} = Rs;
+    let Inst{11-8} = u4_0;
+    let Inst{3-0} = Rt;
+  }
+
+// SL2_return_tnew: Deallocate stack frame and return.
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def V4_SL2_return_tnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (p0.new) dealloc_return:nt"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b110;
+  }
+
+// SL2_return_fnew: Deallocate stack frame and return.
+let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
+def V4_SL2_return_fnew: SUBInst <
+  (outs ),
+  (ins ),
+  "if (!p0.new) dealloc_return:nt"> {
+    let Inst{12-6} = 0b1111101;
+    let Inst{2-0} = 0b111;
+  }
+
+// SA1_zxth: Zero-extend halfword.
+let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
+def V4_SA1_zxth: SUBInst <
+  (outs IntRegs:$Rd),
+  (ins IntRegs:$Rs),
+  "$Rd = zxth($Rs)"> {
+    bits<4> Rd;
+    bits<4> Rs;
+
+    let Inst{12-8} = 0b10110;
+    let Inst{3-0} = Rd;
+    let Inst{7-4} = Rs;
+  }
+
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 535d1f91b493..75189b696ea2 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -15,9 +15,12 @@
 #include "Hexagon.h"
 #include "HexagonAsmPrinter.h"
 #include "HexagonMachineFunctionInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 
@@ -28,19 +31,30 @@ static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol,
   MCContext &MC = Printer.OutContext;
   const MCExpr *ME;
 
-  ME = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, MC);
+  ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC);
 
   if (!MO.isJTI() && MO.getOffset())
-    ME = MCBinaryExpr::CreateAdd(ME, MCConstantExpr::Create(MO.getOffset(), MC),
+    ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC),
                                  MC);
 
   return (MCOperand::createExpr(ME));
 }
 
 // Create an MCInst from a MachineInstr
-void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCI,
+void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB,
                             HexagonAsmPrinter& AP) {
-  MCI.setOpcode(MI->getOpcode());
+  if(MI->getOpcode() == Hexagon::ENDLOOP0){
+    HexagonMCInstrInfo::setInnerLoop(MCB);
+    return;
+  }
+  if(MI->getOpcode() == Hexagon::ENDLOOP1){
+    HexagonMCInstrInfo::setOuterLoop(MCB);
+    return;
+  }
+  MCInst* MCI = new (AP.OutContext) MCInst;
+  MCI->setOpcode(MI->getOpcode());
+  assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) &&
+         "MCI opcode should have been set on construction");
 
   for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
     const MachineOperand &MO = MI->getOperand(i);
@@ -67,7 +81,7 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCI,
       break;
     case MachineOperand::MO_MachineBasicBlock:
       MCO = MCOperand::createExpr
-              (MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(),
+              (MCSymbolRefExpr::create(MO.getMBB()->getSymbol(),
                AP.OutContext));
       break;
     case MachineOperand::MO_GlobalAddress:
@@ -88,6 +102,7 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCI,
       break;
     }
 
-    MCI.addOperand(MCO);
+    MCI->addOperand(MCO);
   }
+  MCB.addOperand(MCOperand::createInst(MCI));
 }
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index b7f364ef0751..be8204b7de53 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -27,6 +27,7 @@ let PrintMethod = "printImmOperand" in {
   def s8Imm : Operand<i32>;
   def s8Imm64 : Operand<i64>;
   def s6Imm : Operand<i32>;
+  def s6_3Imm : Operand<i32>;
   def s4Imm : Operand<i32>;
   def s4_0Imm : Operand<i32>;
   def s4_1Imm : Operand<i32>;
@@ -51,8 +52,14 @@ let PrintMethod = "printImmOperand" in {
   def u6_2Imm : Operand<i32>;
   def u6_3Imm : Operand<i32>;
   def u5Imm : Operand<i32>;
+  def u5_2Imm : Operand<i32>;
+  def u5_3Imm : Operand<i32>;
   def u4Imm : Operand<i32>;
+  def u4_0Imm : Operand<i32>;
+  def u4_2Imm : Operand<i32>;
   def u3Imm : Operand<i32>;
+  def u3_0Imm : Operand<i32>;
+  def u3_1Imm : Operand<i32>;
   def u2Imm : Operand<i32>;
   def u1Imm : Operand<i32>;
   def n8Imm : Operand<i32>;
@@ -444,6 +451,7 @@ let PrintMethod = "printExtOperand" in {
   def s10Ext : Operand<i32>;
   def s9Ext : Operand<i32>;
   def s8Ext : Operand<i32>;
+  def s7Ext : Operand<i32>;
   def s6Ext : Operand<i32>;
   def s11_0Ext : Operand<i32>;
   def s11_1Ext : Operand<i32>;
diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
index 4c987ed32a64..6253686b4993 100644
--- a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
@@ -4,8 +4,12 @@ add_llvm_library(LLVMHexagonDesc
   HexagonInstPrinter.cpp
   HexagonMCAsmInfo.cpp
   HexagonMCCodeEmitter.cpp
+  HexagonMCCompound.cpp
+  HexagonMCDuplexInfo.cpp
   HexagonMCInstrInfo.cpp
+  HexagonMCShuffler.cpp
   HexagonMCTargetDesc.cpp
+  HexagonShuffler.cpp
   )
 
 add_dependencies(LLVMHexagonDesc HexagonCommonTableGen)
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 155aa9ef9557..76894840153d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -7,19 +7,150 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Hexagon.h"
+#include "HexagonFixupKinds.h"
 #include "HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
+using namespace Hexagon;
 
 namespace {
 
 class HexagonAsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+  StringRef CPU;
+  mutable uint64_t relaxedCnt;
+  std::unique_ptr <MCInstrInfo> MCII;
+  std::unique_ptr <MCInst *> RelaxTarget;
 public:
-  HexagonAsmBackend(Target const & /*T*/) {}
+  HexagonAsmBackend(Target const &T, uint8_t OSABI, StringRef CPU) // CPU is forwarded to the ELF writer
+      : OSABI(OSABI), CPU(CPU), relaxedCnt(0), MCII(T.createMCInstrInfo()), RelaxTarget(new MCInst *) {}
 
-  unsigned getNumFixupKinds() const override { return 0; }
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+    return createHexagonELFObjectWriter(OS, OSABI, CPU);
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return Hexagon::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[Hexagon::NumTargetFixupKinds] = {
+        // This table *must* be in the same order as the fixup_* kinds in
+        // HexagonFixupKinds.h.
+        //
+        // name                           offset  bits    flags
+        {"fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_LO16", 0, 32, 0},
+        {"fixup_Hexagon_HI16", 0, 32, 0},
+        {"fixup_Hexagon_32", 0, 32, 0},
+        {"fixup_Hexagon_16", 0, 32, 0},
+        {"fixup_Hexagon_8", 0, 32, 0},
+        {"fixup_Hexagon_GPREL16_0", 0, 32, 0},
+        {"fixup_Hexagon_GPREL16_1", 0, 32, 0},
+        {"fixup_Hexagon_GPREL16_2", 0, 32, 0},
+        {"fixup_Hexagon_GPREL16_3", 0, 32, 0},
+        {"fixup_Hexagon_HL16", 0, 32, 0},
+        {"fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_16_X", 0, 32, 0},
+        {"fixup_Hexagon_12_X", 0, 32, 0},
+        {"fixup_Hexagon_11_X", 0, 32, 0},
+        {"fixup_Hexagon_10_X", 0, 32, 0},
+        {"fixup_Hexagon_9_X", 0, 32, 0},
+        {"fixup_Hexagon_8_X", 0, 32, 0},
+        {"fixup_Hexagon_7_X", 0, 32, 0},
+        {"fixup_Hexagon_6_X", 0, 32, 0},
+        {"fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_COPY", 0, 32, 0},
+        {"fixup_Hexagon_GLOB_DAT", 0, 32, 0},
+        {"fixup_Hexagon_JMP_SLOT", 0, 32, 0},
+        {"fixup_Hexagon_RELATIVE", 0, 32, 0},
+        {"fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_GOTREL_LO16", 0, 32, 0},
+        {"fixup_Hexagon_GOTREL_HI16", 0, 32, 0},
+        {"fixup_Hexagon_GOTREL_32", 0, 32, 0},
+        {"fixup_Hexagon_GOT_LO16", 0, 32, 0},
+        {"fixup_Hexagon_GOT_HI16", 0, 32, 0},
+        {"fixup_Hexagon_GOT_32", 0, 32, 0},
+        {"fixup_Hexagon_GOT_16", 0, 32, 0},
+        {"fixup_Hexagon_DTPMOD_32", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_LO16", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_HI16", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_32", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_16", 0, 32, 0},
+        {"fixup_Hexagon_GD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_LD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_GD_GOT_LO16", 0, 32, 0},
+        {"fixup_Hexagon_GD_GOT_HI16", 0, 32, 0},
+        {"fixup_Hexagon_GD_GOT_32", 0, 32, 0},
+        {"fixup_Hexagon_GD_GOT_16", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_LO16", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_HI16", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_32", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_16", 0, 32, 0},
+        {"fixup_Hexagon_IE_LO16", 0, 32, 0},
+        {"fixup_Hexagon_IE_HI16", 0, 32, 0},
+        {"fixup_Hexagon_IE_32", 0, 32, 0},
+        {"fixup_Hexagon_IE_16", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_LO16", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_HI16", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_32", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_16", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_LO16", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_HI16", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_32", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_16", 0, 32, 0},
+        {"fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+        {"fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_GOTREL_16_X", 0, 32, 0},
+        {"fixup_Hexagon_GOTREL_11_X", 0, 32, 0},
+        {"fixup_Hexagon_GOT_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_GOT_16_X", 0, 32, 0},
+        {"fixup_Hexagon_GOT_11_X", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_16_X", 0, 32, 0},
+        {"fixup_Hexagon_DTPREL_11_X", 0, 32, 0},
+        {"fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_GD_GOT_16_X", 0, 32, 0},
+        {"fixup_Hexagon_GD_GOT_11_X", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_16_X", 0, 32, 0},
+        {"fixup_Hexagon_LD_GOT_11_X", 0, 32, 0},
+        {"fixup_Hexagon_IE_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_IE_16_X", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_16_X", 0, 32, 0},
+        {"fixup_Hexagon_IE_GOT_11_X", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_32_6_X", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_16_X", 0, 32, 0},
+        {"fixup_Hexagon_TPREL_11_X", 0, 32, 0}};
+
+    if (Kind < FirstTargetFixupKind) {
+      return MCAsmBackend::getFixupKindInfo(Kind);
+    }
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
 
   void applyFixup(MCFixup const & /*Fixup*/, char * /*Data*/,
                   unsigned /*DataSize*/, uint64_t /*Value*/,
@@ -27,14 +158,119 @@ public:
     return;
   }
 
-  bool mayNeedRelaxation(MCInst const & /*Inst*/) const override {
+  bool isInstRelaxable(MCInst const &HMI) const {
+    const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(*MCII, HMI);
+    bool Relaxable = false;
+    // Branches and loop-setup insns are handled as necessary by relaxation.
+    if (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeJ ||
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNV &&
+         MCID.isBranch()) ||
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCR &&
+         HMI.getOpcode() != Hexagon::C4_addipc))
+      if (HexagonMCInstrInfo::isExtendable(*MCII, HMI))
+        Relaxable = true;
+
+    return Relaxable;
+  }
+
+  /// mayNeedRelaxation - Check whether any instruction in the given bundle may
+  /// need relaxation; one directly preceded by an immext is not considered.
+  ///
+  /// \param Inst - The bundle to test (asserted to be a bundle).
+  bool mayNeedRelaxation(MCInst const &Inst) const override {
+    assert(HexagonMCInstrInfo::isBundle(Inst));
+    bool PreviousIsExtender = false;
+    for (auto const &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+      auto const &Inst = *I.getInst();
+      if (!PreviousIsExtender) {
+        if (isInstRelaxable(Inst))
+          return true;
+      }
+      PreviousIsExtender = HexagonMCInstrInfo::isImmext(Inst);
+    }
     return false;
   }
 
-  bool fixupNeedsRelaxation(MCFixup const & /*Fixup*/, uint64_t /*Value*/,
-                            MCRelaxableFragment const * /*DF*/,
-                            MCAsmLayout const & /*Layout*/) const override {
-    llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+  /// fixupNeedsRelaxationAdvanced - Target specific predicate for whether a
+  /// given fixup requires the associated instruction to be relaxed.
+  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+                                    uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout) const override {
+    MCInst const &MCB = DF->getInst();
+    assert(HexagonMCInstrInfo::isBundle(MCB));
+
+    *RelaxTarget = nullptr;
+    MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction(
+        MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE));
+    // An unresolved fixup may need relaxation only for the kinds listed below.
+    if (!Resolved) {
+      switch ((unsigned)Fixup.getKind()) {
+      case fixup_Hexagon_B22_PCREL:
+      // GetFixupCount assumes B22 won't relax
+      // Fallthrough
+      default:
+        return false;
+        break;
+      case fixup_Hexagon_B13_PCREL:
+      case fixup_Hexagon_B15_PCREL:
+      case fixup_Hexagon_B9_PCREL:
+      case fixup_Hexagon_B7_PCREL: {
+        if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
+          ++relaxedCnt;
+          *RelaxTarget = &MCI;
+          return true;
+        } else {
+          return false;
+        }
+        break;
+      }
+      }
+    }
+    bool Relaxable = isInstRelaxable(MCI);
+    if (Relaxable == false)
+      return false;
+
+    MCFixupKind Kind = Fixup.getKind();
+    int64_t sValue = Value;
+    int64_t maxValue;
+
+    switch ((unsigned)Kind) {
+    case fixup_Hexagon_B7_PCREL:
+      maxValue = 1 << 8;
+      break;
+    case fixup_Hexagon_B9_PCREL:
+      maxValue = 1 << 10;
+      break;
+    case fixup_Hexagon_B15_PCREL:
+      maxValue = 1 << 16;
+      break;
+    case fixup_Hexagon_B22_PCREL:
+      maxValue = 1 << 23;
+      break;
+    default:
+      maxValue = INT64_MAX;
+      break;
+    }
+
+    bool isFarAway = -maxValue > sValue || sValue > maxValue - 1;
+
+    if (isFarAway) {
+      if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) {
+        ++relaxedCnt;
+        *RelaxTarget = &MCI;
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /// Unreachable on Hexagon; fixupNeedsRelaxationAdvanced handles relaxation.
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
   }
 
   void relaxInstruction(MCInst const & /*Inst*/,
@@ -49,26 +285,11 @@ public:
 };
 } // end anonymous namespace
 
-namespace {
-class ELFHexagonAsmBackend : public HexagonAsmBackend {
-  uint8_t OSABI;
-
-public:
-  ELFHexagonAsmBackend(Target const &T, uint8_t OSABI)
-      : HexagonAsmBackend(T), OSABI(OSABI) {}
-
-  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
-    StringRef CPU("HexagonV4");
-    return createHexagonELFObjectWriter(OS, OSABI, CPU);
-  }
-};
-} // end anonymous namespace
-
 namespace llvm {
 MCAsmBackend *createHexagonAsmBackend(Target const &T,
                                       MCRegisterInfo const & /*MRI*/,
-                                      StringRef TT, StringRef /*CPU*/) {
+                                      StringRef TT, StringRef CPU) {
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
-  return new ELFHexagonAsmBackend(T, OSABI);
+  return new HexagonAsmBackend(T, OSABI, CPU);
 }
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 6a72f205e9d3..f4d162ccf6a8 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -43,6 +43,7 @@ namespace HexagonII {
     TypeXTYPE   = 8,
     TypeMEMOP   = 9,
     TypeNV      = 10,
+    TypeDUPLEX  = 11,
     TypePREFIX  = 30, // Such as extenders.
     TypeENDLOOP = 31  // Such as end of a HW loop.
   };
@@ -190,7 +191,26 @@ namespace HexagonII {
     MO_GPREL
   };
 
-  enum class InstParseBits : uint32_t {
+  // Hexagon Sub-instruction classes.
+  enum SubInstructionGroup {
+    HSIG_None = 0,
+    HSIG_L1,
+    HSIG_L2,
+    HSIG_S1,
+    HSIG_S2,
+    HSIG_A,
+    HSIG_Compound
+  };
+
+  // Hexagon Compound classes.
+  enum CompoundGroup {
+    HCG_None = 0,
+    HCG_A,
+    HCG_B,
+    HCG_C
+  };
+
+  enum InstParseBits {
     INST_PARSE_MASK       = 0x0000c000,
     INST_PARSE_PACKET_END = 0x0000c000,
     INST_PARSE_LOOP_END   = 0x00008000,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index fde935b2758b..843072302b21 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Hexagon.h"
+#include "MCTargetDesc/HexagonFixupKinds.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/Support/Debug.h"
@@ -40,17 +41,306 @@ HexagonELFObjectWriter::HexagonELFObjectWriter(uint8_t OSABI, StringRef C)
 unsigned HexagonELFObjectWriter::GetRelocType(MCValue const &/*Target*/,
                                               MCFixup const &Fixup,
                                               bool IsPCRel) const {
+  // determine the type of the relocation
   unsigned Type = (unsigned)ELF::R_HEX_NONE;
-  llvm::MCFixupKind Kind = Fixup.getKind();
+  unsigned Kind = (unsigned)Fixup.getKind();
 
   switch (Kind) {
-  default:
-    DEBUG(dbgs() << "unrecognized relocation " << Fixup.getKind() << "\n");
-    llvm_unreachable("Unimplemented Fixup kind!");
-    break;
-  case FK_Data_4:
-    Type = (IsPCRel) ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32;
-    break;
+    default:
+      DEBUG(dbgs() << "unrecognized relocation " << Fixup.getKind() << "\n");
+      llvm_unreachable("Unimplemented Fixup kind!");
+      break;
+    case FK_Data_4:
+      Type = (IsPCRel) ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32;
+      break;
+    case FK_PCRel_4:
+      Type = ELF::R_HEX_32_PCREL;
+      break;
+    case FK_Data_2:
+      Type = ELF::R_HEX_16;
+      break;
+   case FK_Data_1:
+      Type = ELF::R_HEX_8;
+      break;
+    case fixup_Hexagon_B22_PCREL:
+      Type = ELF::R_HEX_B22_PCREL;
+      break;
+    case fixup_Hexagon_B15_PCREL:
+      Type = ELF::R_HEX_B15_PCREL;
+      break;
+    case fixup_Hexagon_B7_PCREL:
+      Type = ELF::R_HEX_B7_PCREL;
+      break;
+    case fixup_Hexagon_LO16:
+      Type = ELF::R_HEX_LO16;
+      break;
+    case fixup_Hexagon_HI16:
+      Type = ELF::R_HEX_HI16;
+      break;
+    case fixup_Hexagon_32:
+      Type = ELF::R_HEX_32;
+      break;
+    case fixup_Hexagon_16:
+      Type = ELF::R_HEX_16;
+      break;
+    case fixup_Hexagon_8:
+      Type = ELF::R_HEX_8;
+      break;
+    case fixup_Hexagon_GPREL16_0:
+      Type = ELF::R_HEX_GPREL16_0;
+      break;
+    case fixup_Hexagon_GPREL16_1:
+      Type = ELF::R_HEX_GPREL16_1;
+      break;
+    case fixup_Hexagon_GPREL16_2:
+      Type = ELF::R_HEX_GPREL16_2;
+      break;
+    case fixup_Hexagon_GPREL16_3:
+      Type = ELF::R_HEX_GPREL16_3;
+      break;
+    case fixup_Hexagon_HL16:
+      Type = ELF::R_HEX_HL16;
+      break;
+    case fixup_Hexagon_B13_PCREL:
+      Type = ELF::R_HEX_B13_PCREL;
+      break;
+    case fixup_Hexagon_B9_PCREL:
+      Type = ELF::R_HEX_B9_PCREL;
+      break;
+    case fixup_Hexagon_B32_PCREL_X:
+      Type = ELF::R_HEX_B32_PCREL_X;
+      break;
+    case fixup_Hexagon_32_6_X:
+      Type = ELF::R_HEX_32_6_X;
+      break;
+    case fixup_Hexagon_B22_PCREL_X:
+      Type = ELF::R_HEX_B22_PCREL_X;
+      break;
+    case fixup_Hexagon_B15_PCREL_X:
+      Type = ELF::R_HEX_B15_PCREL_X;
+      break;
+    case fixup_Hexagon_B13_PCREL_X:
+      Type = ELF::R_HEX_B13_PCREL_X;
+      break;
+    case fixup_Hexagon_B9_PCREL_X:
+      Type = ELF::R_HEX_B9_PCREL_X;
+      break;
+    case fixup_Hexagon_B7_PCREL_X:
+      Type = ELF::R_HEX_B7_PCREL_X;
+      break;
+    case fixup_Hexagon_16_X:
+      Type = ELF::R_HEX_16_X;
+      break;
+    case fixup_Hexagon_12_X:
+      Type = ELF::R_HEX_12_X;
+      break;
+    case fixup_Hexagon_11_X:
+      Type = ELF::R_HEX_11_X;
+      break;
+    case fixup_Hexagon_10_X:
+      Type = ELF::R_HEX_10_X;
+      break;
+    case fixup_Hexagon_9_X:
+      Type = ELF::R_HEX_9_X;
+      break;
+    case fixup_Hexagon_8_X:
+      Type = ELF::R_HEX_8_X;
+      break;
+    case fixup_Hexagon_7_X:
+      Type = ELF::R_HEX_7_X;
+      break;
+    case fixup_Hexagon_6_X:
+      Type = ELF::R_HEX_6_X;
+      break;
+    case fixup_Hexagon_32_PCREL:
+      Type = ELF::R_HEX_32_PCREL;
+      break;
+    case fixup_Hexagon_COPY:
+      Type = ELF::R_HEX_COPY;
+      break;
+    case fixup_Hexagon_GLOB_DAT:
+      Type = ELF::R_HEX_GLOB_DAT;
+      break;
+    case fixup_Hexagon_JMP_SLOT:
+      Type = ELF::R_HEX_JMP_SLOT;
+      break;
+    case fixup_Hexagon_RELATIVE:
+      Type = ELF::R_HEX_RELATIVE;
+      break;
+    case fixup_Hexagon_PLT_B22_PCREL:
+      Type = ELF::R_HEX_PLT_B22_PCREL;
+      break;
+    case fixup_Hexagon_GOTREL_LO16:
+      Type = ELF::R_HEX_GOTREL_LO16;
+      break;
+    case fixup_Hexagon_GOTREL_HI16:
+      Type = ELF::R_HEX_GOTREL_HI16;
+      break;
+    case fixup_Hexagon_GOTREL_32:
+      Type = ELF::R_HEX_GOTREL_32;
+      break;
+    case fixup_Hexagon_GOT_LO16:
+      Type = ELF::R_HEX_GOT_LO16;
+      break;
+    case fixup_Hexagon_GOT_HI16:
+      Type = ELF::R_HEX_GOT_HI16;
+      break;
+    case fixup_Hexagon_GOT_32:
+      Type = ELF::R_HEX_GOT_32;
+      break;
+    case fixup_Hexagon_GOT_16:
+      Type = ELF::R_HEX_GOT_16;
+      break;
+    case fixup_Hexagon_DTPMOD_32:
+      Type = ELF::R_HEX_DTPMOD_32;
+      break;
+    case fixup_Hexagon_DTPREL_LO16:
+      Type = ELF::R_HEX_DTPREL_LO16;
+      break;
+    case fixup_Hexagon_DTPREL_HI16:
+      Type = ELF::R_HEX_DTPREL_HI16;
+      break;
+    case fixup_Hexagon_DTPREL_32:
+      Type = ELF::R_HEX_DTPREL_32;
+      break;
+    case fixup_Hexagon_DTPREL_16:
+      Type = ELF::R_HEX_DTPREL_16;
+      break;
+    case fixup_Hexagon_GD_PLT_B22_PCREL:
+      Type = ELF::R_HEX_GD_PLT_B22_PCREL;
+      break;
+    case fixup_Hexagon_LD_PLT_B22_PCREL:
+      Type = ELF::R_HEX_LD_PLT_B22_PCREL;
+      break;
+    case fixup_Hexagon_GD_GOT_LO16:
+      Type = ELF::R_HEX_GD_GOT_LO16;
+      break;
+    case fixup_Hexagon_GD_GOT_HI16:
+      Type = ELF::R_HEX_GD_GOT_HI16;
+      break;
+    case fixup_Hexagon_GD_GOT_32:
+      Type = ELF::R_HEX_GD_GOT_32;
+      break;
+    case fixup_Hexagon_GD_GOT_16:
+      Type = ELF::R_HEX_GD_GOT_16;
+      break;
+    case fixup_Hexagon_LD_GOT_LO16:
+      Type = ELF::R_HEX_LD_GOT_LO16;
+      break;
+    case fixup_Hexagon_LD_GOT_HI16:
+      Type = ELF::R_HEX_LD_GOT_HI16;
+      break;
+    case fixup_Hexagon_LD_GOT_32:
+      Type = ELF::R_HEX_LD_GOT_32;
+      break;
+    case fixup_Hexagon_LD_GOT_16:
+      Type = ELF::R_HEX_LD_GOT_16;
+      break;
+    case fixup_Hexagon_IE_LO16:
+      Type = ELF::R_HEX_IE_LO16;
+      break;
+    case fixup_Hexagon_IE_HI16:
+      Type = ELF::R_HEX_IE_HI16;
+      break;
+    case fixup_Hexagon_IE_32:
+      Type = ELF::R_HEX_IE_32;
+      break;
+    case fixup_Hexagon_IE_GOT_LO16:
+      Type = ELF::R_HEX_IE_GOT_LO16;
+      break;
+    case fixup_Hexagon_IE_GOT_HI16:
+      Type = ELF::R_HEX_IE_GOT_HI16;
+      break;
+    case fixup_Hexagon_IE_GOT_32:
+      Type = ELF::R_HEX_IE_GOT_32;
+      break;
+    case fixup_Hexagon_IE_GOT_16:
+      Type = ELF::R_HEX_IE_GOT_16;
+      break;
+    case fixup_Hexagon_TPREL_LO16:
+      Type = ELF::R_HEX_TPREL_LO16;
+      break;
+    case fixup_Hexagon_TPREL_HI16:
+      Type = ELF::R_HEX_TPREL_HI16;
+      break;
+    case fixup_Hexagon_TPREL_32:
+      Type = ELF::R_HEX_TPREL_32;
+      break;
+    case fixup_Hexagon_TPREL_16:
+      Type = ELF::R_HEX_TPREL_16;
+      break;
+    case fixup_Hexagon_6_PCREL_X:
+      Type = ELF::R_HEX_6_PCREL_X;
+      break;
+    case fixup_Hexagon_GOTREL_32_6_X:
+      Type = ELF::R_HEX_GOTREL_32_6_X;
+      break;
+    case fixup_Hexagon_GOTREL_16_X:
+      Type = ELF::R_HEX_GOTREL_16_X;
+      break;
+    case fixup_Hexagon_GOTREL_11_X:
+      Type = ELF::R_HEX_GOTREL_11_X;
+      break;
+    case fixup_Hexagon_GOT_32_6_X:
+      Type = ELF::R_HEX_GOT_32_6_X;
+      break;
+    case fixup_Hexagon_GOT_16_X:
+      Type = ELF::R_HEX_GOT_16_X;
+      break;
+    case fixup_Hexagon_GOT_11_X:
+      Type = ELF::R_HEX_GOT_11_X;
+      break;
+    case fixup_Hexagon_DTPREL_32_6_X:
+      Type = ELF::R_HEX_DTPREL_32_6_X;
+      break;
+    case fixup_Hexagon_DTPREL_16_X:
+      Type = ELF::R_HEX_DTPREL_16_X;
+      break;
+    case fixup_Hexagon_DTPREL_11_X:
+      Type = ELF::R_HEX_DTPREL_11_X;
+      break;
+    case fixup_Hexagon_GD_GOT_32_6_X:
+      Type = ELF::R_HEX_GD_GOT_32_6_X;
+      break;
+    case fixup_Hexagon_GD_GOT_16_X:
+      Type = ELF::R_HEX_GD_GOT_16_X;
+      break;
+    case fixup_Hexagon_GD_GOT_11_X:
+      Type = ELF::R_HEX_GD_GOT_11_X;
+      break;
+    case fixup_Hexagon_LD_GOT_32_6_X:
+      Type = ELF::R_HEX_LD_GOT_32_6_X;
+      break;
+    case fixup_Hexagon_LD_GOT_16_X:
+      Type = ELF::R_HEX_LD_GOT_16_X;
+      break;
+    case fixup_Hexagon_LD_GOT_11_X:
+      Type = ELF::R_HEX_LD_GOT_11_X;
+      break;
+    case fixup_Hexagon_IE_32_6_X:
+      Type = ELF::R_HEX_IE_32_6_X;
+      break;
+    case fixup_Hexagon_IE_16_X:
+      Type = ELF::R_HEX_IE_16_X;
+      break;
+    case fixup_Hexagon_IE_GOT_32_6_X:
+      Type = ELF::R_HEX_IE_GOT_32_6_X;
+      break;
+    case fixup_Hexagon_IE_GOT_16_X:
+      Type = ELF::R_HEX_IE_GOT_16_X;
+      break;
+    case fixup_Hexagon_IE_GOT_11_X:
+      Type = ELF::R_HEX_IE_GOT_11_X;
+      break;
+    case fixup_Hexagon_TPREL_32_6_X:
+      Type = ELF::R_HEX_TPREL_32_6_X;
+      break;
+    case fixup_Hexagon_TPREL_16_X:
+      Type = ELF::R_HEX_TPREL_16_X;
+      break;
+    case fixup_Hexagon_TPREL_11_X:
+      Type = ELF::R_HEX_TPREL_11_X;
+      break;
   }
   return Type;
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 15cda717cf1c..36f81465eef6 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -28,7 +28,47 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "HexagonGenAsmWriter.inc"
 
-const char HexagonInstPrinter::PacketPadding = '\t';
+HexagonAsmInstPrinter::HexagonAsmInstPrinter(MCInstPrinter *RawPrinter)
+    : MCInstPrinter(*RawPrinter), RawPrinter(RawPrinter) {}
+
+void HexagonAsmInstPrinter::printInst(MCInst const *MI, raw_ostream &O,
+                                      StringRef Annot,
+                                      MCSubtargetInfo const &STI) {
+  assert(HexagonMCInstrInfo::isBundle(*MI));
+  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
+  std::string Buffer;
+  {
+    raw_string_ostream TempStream(Buffer);
+    RawPrinter->printInst(MI, TempStream, "", STI);
+  }
+  StringRef Contents(Buffer);
+  auto PacketBundle = Contents.rsplit('\n');
+  auto HeadTail = PacketBundle.first.split('\n');
+  auto Preamble = "\t{\n\t\t";
+  auto Separator = "";
+  while(!HeadTail.first.empty()) {
+    O << Separator;
+    StringRef Inst;
+    auto Duplex = HeadTail.first.split('\v');
+    if(!Duplex.second.empty()){
+      O << Duplex.first << "\n";
+      Inst = Duplex.second;
+    }
+    else
+      Inst = Duplex.first;
+    O << Preamble;
+    O << Inst;
+    HeadTail = HeadTail.second.split('\n');
+    Preamble = "";
+    Separator = "\n\t\t";
+  }
+  O << "\n\t}" << PacketBundle.second;
+}
+
+void HexagonAsmInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
+  RawPrinter->printRegName(O, RegNo);
+}
+
 // Return the minimum value that a constant extendable operand can have
 // without being extended.
 static int getMinValue(uint64_t TSFlags) {
@@ -77,48 +117,44 @@ void HexagonInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   OS << getRegisterName(RegNo);
 }
 
-void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &O,
-                                   StringRef Annot,
-                                   const MCSubtargetInfo &STI) {
-  const char startPacket = '{',
-             endPacket = '}';
-  // TODO: add outer HW loop when it's supported too.
-  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
-    // Ending a harware loop is different from ending an regular packet.
-    assert(HexagonMCInstrInfo::isPacketEnd(*MI) && "Loop-end must also end the packet");
-
-    if (HexagonMCInstrInfo::isPacketBegin(*MI)) {
-      // There must be a packet to end a loop.
-      // FIXME: when shuffling is always run, this shouldn't be needed.
-      MCInst Nop;
-      StringRef NoAnnot;
-
-      Nop.setOpcode (Hexagon::A2_nop);
-      HexagonMCInstrInfo::setPacketBegin (Nop, HexagonMCInstrInfo::isPacketBegin(*MI));
-      printInst (&Nop, O, NoAnnot, STI);
-    }
+void HexagonInstPrinter::setExtender(MCInst const &MCI) {
+  HasExtender = HexagonMCInstrInfo::isImmext(MCI);
+}
 
-    // Close the packet.
-    if (HexagonMCInstrInfo::isPacketEnd(*MI))
-      O << PacketPadding << endPacket;
+void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS,
+                                   StringRef Annot,
+                                   MCSubtargetInfo const &STI) {
+  assert(HexagonMCInstrInfo::isBundle(*MI));
+  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
+  HasExtender = false;
+  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
+    MCInst const &MCI = *I.getInst();
+    if (HexagonMCInstrInfo::isDuplex(MII, MCI)) {
+      printInstruction(MCI.getOperand(1).getInst(), OS);
+      OS << '\v';
+      HasExtender = false;
+      printInstruction(MCI.getOperand(0).getInst(), OS);
+    } else
+      printInstruction(&MCI, OS);
+    setExtender(MCI);
+    OS << "\n";
+  }
 
-    printInstruction(MI, O);
+  auto Separator = "";
+  if (HexagonMCInstrInfo::isInnerLoop(*MI)) {
+    OS << Separator;
+    Separator = " ";
+    MCInst ME;
+    ME.setOpcode(Hexagon::ENDLOOP0);
+    printInstruction(&ME, OS);
   }
-  else {
-    // Prefix the insn opening the packet.
-    if (HexagonMCInstrInfo::isPacketBegin(*MI))
-      O << PacketPadding << startPacket << '\n';
-
-    printInstruction(MI, O);
-
-    // Suffix the insn closing the packet.
-    if (HexagonMCInstrInfo::isPacketEnd(*MI))
-      // Suffix the packet in a new line always, since the GNU assembler has
-      // issues with a closing brace on the same line as CONST{32,64}.
-      O << '\n' << PacketPadding << endPacket;
+  if (HexagonMCInstrInfo::isOuterLoop(*MI)) {
+    OS << Separator;
+    Separator = " ";
+    MCInst ME;
+    ME.setOpcode(Hexagon::ENDLOOP1);
+    printInstruction(&ME, OS);
   }
-
-  printAnnotation(O, Annot);
 }
 
 void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -128,7 +164,7 @@ void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   if (MO.isReg()) {
     printRegName(O, MO.getReg());
   } else if(MO.isExpr()) {
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, &MAI);
   } else if(MO.isImm()) {
     printImmOperand(MI, OpNo, O);
   } else {
@@ -141,7 +177,7 @@ void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
   const MCOperand& MO = MI->getOperand(OpNo);
 
   if(MO.isExpr()) {
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, &MAI);
   } else if(MO.isImm()) {
     O << MI->getOperand(OpNo).getImm();
   } else {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 3fedaed8fbf9..534ac237d635 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -18,6 +18,21 @@
 #include "llvm/MC/MCInstrInfo.h"
 
 namespace llvm {
+class HexagonAsmInstPrinter : public MCInstPrinter {
+public:
+  HexagonAsmInstPrinter(MCInstPrinter *RawPrinter);
+  void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot,
+                 MCSubtargetInfo const &STI) override;
+  void printRegName(raw_ostream &O, unsigned RegNo) const override;
+  std::unique_ptr<MCInstPrinter> RawPrinter;
+};
+/// Prints bundles as a newline separated list of individual instructions
+/// Duplexes are separated by a vertical tab \v character
+/// A trailing line includes bundle properties such as endloop0/1
+///
+/// r0 = add(r1, r2)
+/// r0 = #0 \v jump 0x0
+/// :endloop0 :endloop1
   class HexagonInstPrinter : public MCInstPrinter {
   public:
     explicit HexagonInstPrinter(MCAsmInfo const &MAI,
@@ -74,11 +89,11 @@ namespace llvm {
     void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi)
            const;
 
-    static const char PacketPadding;
-
   private:
     const MCInstrInfo &MII;
 
+    bool HasExtender;
+    void setExtender(MCInst const &MCI);
   };
 
 } // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index ad5e0fb15e7f..51d2f1c878dc 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
 // Pin the vtable to this file.
 void HexagonMCAsmInfo::anchor() {}
 
-HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
+HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
   Data64bitsDirective = nullptr;  // .xword is only supported by V9.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index ab18f0b37ba6..dc0706994786 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -18,10 +18,12 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
+  class Triple;
+
   class HexagonMCAsmInfo : public MCAsmInfoELF {
     void anchor() override;
   public:
-    explicit HexagonMCAsmInfo(StringRef TT);
+    explicit HexagonMCAsmInfo(const Triple &TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index ae3953abba10..1eee852996fd 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -22,6 +22,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "mccodeemitter"
@@ -31,38 +32,206 @@ using namespace Hexagon;
 
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
-namespace {
-/// \brief 10.6 Instruction Packets
-/// Possible values for instruction packet parse field.
-enum class ParseField { duplex = 0x0, last0 = 0x1, last1 = 0x2, end = 0x3 };
-/// \brief Returns the packet bits based on instruction position.
-uint32_t getPacketBits(MCInst const &HMI) {
-  unsigned const ParseFieldOffset = 14;
-  ParseField Field = HexagonMCInstrInfo::isPacketEnd(HMI) ? ParseField::end
-                                                          : ParseField::last0;
-  return static_cast<uint32_t>(Field) << ParseFieldOffset;
-}
-void emitLittleEndian(uint64_t Binary, raw_ostream &OS) {
-  OS << static_cast<uint8_t>((Binary >> 0x00) & 0xff);
-  OS << static_cast<uint8_t>((Binary >> 0x08) & 0xff);
-  OS << static_cast<uint8_t>((Binary >> 0x10) & 0xff);
-  OS << static_cast<uint8_t>((Binary >> 0x18) & 0xff);
-}
-}
-
 HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
                                            MCContext &aMCT)
     : MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
-      Extended(new bool(false)) {}
+      Extended(new bool(false)), CurrentBundle(new MCInst const *(nullptr)) {}
+
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Instruction, size_t Last,
+                                         MCInst const &MCB,
+                                         MCInst const &MCI) const {
+  // Select the parse bits for the instruction at index `Instruction' of
+  // bundle MCB, where `Last' is the index of the final instruction.
+  bool const IsDuplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
+
+  // The first instruction of an inner-loop bundle and the second instruction
+  // of an outer-loop bundle carry the loop-end marker.
+  bool const EndsLoop =
+      (Instruction == 0 && HexagonMCInstrInfo::isInnerLoop(MCB)) ||
+      (Instruction == 1 && HexagonMCInstrInfo::isOuterLoop(MCB));
+  if (EndsLoop) {
+    assert(!IsDuplex);
+    assert(Instruction != Last);
+    return HexagonII::INST_PARSE_LOOP_END;
+  }
+
+  // A duplex is only expected in the last position of the bundle.
+  if (IsDuplex) {
+    assert(Instruction == Last);
+    return HexagonII::INST_PARSE_DUPLEX;
+  }
+  return Instruction == Last ? HexagonII::INST_PARSE_PACKET_END
+                             : HexagonII::INST_PARSE_NOT_END;
+}
 
 void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
                                              SmallVectorImpl<MCFixup> &Fixups,
                                              MCSubtargetInfo const &STI) const {
-  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI) | getPacketBits(MI);
-  assert(HexagonMCInstrInfo::getDesc(MCII, MI).getSize() == 4 &&
-         "All instructions should be 32bit");
-  (void)&MCII;
-  emitLittleEndian(Binary, OS);
+  // MI must be a bundle; its sub-instructions are encoded one by one.
+  MCInst &Bundle = const_cast<MCInst &>(MI);
+  assert(HexagonMCInstrInfo::isBundle(Bundle));
+  DEBUG(dbgs() << "Encoding bundle\n";);
+  // Reset the per-bundle state (Addend, Extended, CurrentBundle).
+  *Addend = 0;
+  *Extended = false;
+  *CurrentBundle = &MI;
+  size_t const Last = HexagonMCInstrInfo::bundleSize(Bundle) - 1;
+  size_t Index = 0;
+  for (auto &Entry : HexagonMCInstrInfo::bundleInstructions(Bundle)) {
+    MCInst &Inst = const_cast<MCInst &>(*Entry.getInst());
+    uint32_t const Parse = parseBits(Index, Last, Bundle, Inst);
+    EncodeSingleInstruction(Inst, OS, Fixups, STI, Parse, Index);
+    // Record whether the instruction just encoded is an immediate extender.
+    *Extended = HexagonMCInstrInfo::isImmext(Inst);
+    *Addend += HEXAGON_INSTR_SIZE;
+    ++Index;
+  }
+}
+
+/// EncodeSingleInstruction - Encode one non-bundle instruction of a bundle.
+void HexagonMCCodeEmitter::EncodeSingleInstruction(
+    const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI, uint32_t Parse, size_t Index) const {
+  MCInst HMB = MI; // local copy: the new-value operand is rewritten below
+  assert(!HexagonMCInstrInfo::isBundle(HMB));
+  uint64_t Binary;
+
+  // Pseudo instructions don't get encoded and shouldn't be here
+  // in the first place!
+  assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
+         "pseudo-instruction found");
+  DEBUG(dbgs() << "Encoding insn"
+                  " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+                                                                    "\n");
+
+  if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
+    // Calculate the new value distance to the associated producer
+    MCOperand &MCO =
+        HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
+    unsigned SOffset = 0;
+    unsigned Register = MCO.getReg();
+    unsigned Register1;
+    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+    auto i = Instructions.begin() + Index - 1;
+    for (;; --i) { // walk backwards from the preceding instruction
+      assert(i != Instructions.begin() - 1 && "Couldn't find producer");
+      MCInst const &Inst = *i->getInst();
+      if (HexagonMCInstrInfo::isImmext(Inst))
+        continue; // extenders do not count towards the distance
+      ++SOffset;
+      Register1 =
+          HexagonMCInstrInfo::hasNewValue(MCII, Inst)
+              ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
+              : static_cast<unsigned>(Hexagon::NoRegister);
+      if (Register != Register1)
+        // This isn't the register we're looking for
+        continue;
+      if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+        // Producer is unpredicated
+        break;
+      assert(HexagonMCInstrInfo::isPredicated(MCII, HMB) &&
+             "Unpredicated consumer depending on predicated producer");
+      if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
+          HexagonMCInstrInfo::isPredicatedTrue(MCII, HMB))
+        // Producer predicate sense matched ours
+        break;
+    }
+    // Hexagon PRM 10.11 Construct Nt from distance
+    unsigned Offset = SOffset;
+    Offset <<= 1;
+    MCO.setReg(Offset + Hexagon::R0);
+  }
+
+  Binary = getBinaryCodeForInstr(HMB, Fixups, STI);
+  // Check for unimplemented instructions. Immediate extenders
+  // are encoded as zero, so they need to be accounted for.
+  if ((!Binary) &&
+      ((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
+       (HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
+       (HMB.getOpcode() != A4_ext_g))) {
+    // An all-zero encoding for any other opcode is treated as unimplemented.
+    DEBUG(dbgs() << "Unimplemented inst: "
+                    " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+                                                                      "\n");
+    llvm_unreachable("Unimplemented Instruction");
+  }
+  Binary |= Parse;
+
+  // if we need to emit a duplexed instruction
+  if (HMB.getOpcode() >= Hexagon::DuplexIClass0 &&
+      HMB.getOpcode() <= Hexagon::DuplexIClassF) {
+    assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
+           "Emitting duplex without duplex parse bits");
+    unsigned dupIClass;
+    switch (HMB.getOpcode()) {
+    case Hexagon::DuplexIClass0:
+      dupIClass = 0;
+      break;
+    case Hexagon::DuplexIClass1:
+      dupIClass = 1;
+      break;
+    case Hexagon::DuplexIClass2:
+      dupIClass = 2;
+      break;
+    case Hexagon::DuplexIClass3:
+      dupIClass = 3;
+      break;
+    case Hexagon::DuplexIClass4:
+      dupIClass = 4;
+      break;
+    case Hexagon::DuplexIClass5:
+      dupIClass = 5;
+      break;
+    case Hexagon::DuplexIClass6:
+      dupIClass = 6;
+      break;
+    case Hexagon::DuplexIClass7:
+      dupIClass = 7;
+      break;
+    case Hexagon::DuplexIClass8:
+      dupIClass = 8;
+      break;
+    case Hexagon::DuplexIClass9:
+      dupIClass = 9;
+      break;
+    case Hexagon::DuplexIClassA:
+      dupIClass = 10;
+      break;
+    case Hexagon::DuplexIClassB:
+      dupIClass = 11;
+      break;
+    case Hexagon::DuplexIClassC:
+      dupIClass = 12;
+      break;
+    case Hexagon::DuplexIClassD:
+      dupIClass = 13;
+      break;
+    case Hexagon::DuplexIClassE:
+      dupIClass = 14;
+      break;
+    case Hexagon::DuplexIClassF:
+      dupIClass = 15;
+      break;
+    default:
+      llvm_unreachable("Unimplemented DuplexIClass");
+      break;
+    }
+    // 29 is the bit position.
+    // 0b1110 =0xE bits are masked off and down shifted by 1 bit.
+    // Last bit is moved to bit position 13
+    Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
+
+    const MCInst *subInst0 = HMB.getOperand(0).getInst();
+    const MCInst *subInst1 = HMB.getOperand(1).getInst();
+
+    // get subinstruction slot 0
+    unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
+    // get subinstruction slot 1
+    unsigned subInstSlot1Bits = getBinaryCodeForInstr(*subInst1, Fixups, STI);
+
+    Binary |= subInstSlot0Bits | (subInstSlot1Bits << 16);
+  }
+  support::endian::Writer<support::little>(OS).write<uint32_t>(Binary);
   ++MCNumEmitted;
 }
 
@@ -182,7 +351,7 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
 {
   int64_t Res;
 
-  if (ME->EvaluateAsAbsolute(Res))
+  if (ME->evaluateAsAbsolute(Res))
     return Res;
 
   MCExpr::ExprKind MK = ME->getKind();
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 939380af1013..9aa258cee4c6 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -30,6 +30,7 @@ class HexagonMCCodeEmitter : public MCCodeEmitter {
   MCInstrInfo const &MCII;
   std::unique_ptr<unsigned> Addend;
   std::unique_ptr<bool> Extended;
+  std::unique_ptr<MCInst const *> CurrentBundle;
 
   // helper routine for getMachineOpValue()
   unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
@@ -39,12 +40,21 @@ class HexagonMCCodeEmitter : public MCCodeEmitter {
 public:
   HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
 
+  // Return parse bits for instruction `MCI' inside bundle `MCB'
+  uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB,
+                    MCInst const &MCI) const;
+
   MCSubtargetInfo const &getSubtargetInfo() const;
 
   void encodeInstruction(MCInst const &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
                          MCSubtargetInfo const &STI) const override;
 
+  void EncodeSingleInstruction(const MCInst &MI, raw_ostream &OS,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI,
+                               uint32_t Parse, size_t Index) const;
+
   // \brief TableGen'erated function for getting the
   // binary encoding for an instruction.
   uint64_t getBinaryCodeForInstr(MCInst const &MI,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
new file mode 100644
index 000000000000..108093547f82
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -0,0 +1,420 @@
+
+//=== HexagonMCCompound.cpp - Hexagon Compound checker  -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file looks at a packet and tries to form compound insns.
+//
+//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-mccompound"
+
+enum OpcodeIndex {
+  fp0_jump_nt = 0,
+  fp0_jump_t,
+  fp1_jump_nt,
+  fp1_jump_t,
+  tp0_jump_nt,
+  tp0_jump_t,
+  tp1_jump_nt,
+  tp1_jump_t
+};
+// Compound opcodes per compare kind; each table is indexed by OpcodeIndex.
+static const unsigned tstBitOpcode[8] = {
+    J4_tstbit0_fp0_jump_nt, J4_tstbit0_fp0_jump_t, J4_tstbit0_fp1_jump_nt,
+    J4_tstbit0_fp1_jump_t,  J4_tstbit0_tp0_jump_nt, J4_tstbit0_tp0_jump_t,
+    J4_tstbit0_tp1_jump_nt, J4_tstbit0_tp1_jump_t};
+static const unsigned cmpeqBitOpcode[8] = {
+    J4_cmpeq_fp0_jump_nt, J4_cmpeq_fp0_jump_t, J4_cmpeq_fp1_jump_nt,
+    J4_cmpeq_fp1_jump_t,  J4_cmpeq_tp0_jump_nt, J4_cmpeq_tp0_jump_t,
+    J4_cmpeq_tp1_jump_nt, J4_cmpeq_tp1_jump_t};
+static const unsigned cmpgtBitOpcode[8] = {
+    J4_cmpgt_fp0_jump_nt, J4_cmpgt_fp0_jump_t, J4_cmpgt_fp1_jump_nt,
+    J4_cmpgt_fp1_jump_t,  J4_cmpgt_tp0_jump_nt, J4_cmpgt_tp0_jump_t,
+    J4_cmpgt_tp1_jump_nt, J4_cmpgt_tp1_jump_t};
+static const unsigned cmpgtuBitOpcode[8] = {
+    J4_cmpgtu_fp0_jump_nt, J4_cmpgtu_fp0_jump_t, J4_cmpgtu_fp1_jump_nt,
+    J4_cmpgtu_fp1_jump_t,  J4_cmpgtu_tp0_jump_nt, J4_cmpgtu_tp0_jump_t,
+    J4_cmpgtu_tp1_jump_nt, J4_cmpgtu_tp1_jump_t};
+static const unsigned cmpeqiBitOpcode[8] = {
+    J4_cmpeqi_fp0_jump_nt, J4_cmpeqi_fp0_jump_t, J4_cmpeqi_fp1_jump_nt,
+    J4_cmpeqi_fp1_jump_t,  J4_cmpeqi_tp0_jump_nt, J4_cmpeqi_tp0_jump_t,
+    J4_cmpeqi_tp1_jump_nt, J4_cmpeqi_tp1_jump_t};
+static const unsigned cmpgtiBitOpcode[8] = {
+    J4_cmpgti_fp0_jump_nt, J4_cmpgti_fp0_jump_t, J4_cmpgti_fp1_jump_nt,
+    J4_cmpgti_fp1_jump_t,  J4_cmpgti_tp0_jump_nt, J4_cmpgti_tp0_jump_t,
+    J4_cmpgti_tp1_jump_nt, J4_cmpgti_tp1_jump_t};
+static const unsigned cmpgtuiBitOpcode[8] = {
+    J4_cmpgtui_fp0_jump_nt, J4_cmpgtui_fp0_jump_t, J4_cmpgtui_fp1_jump_nt,
+    J4_cmpgtui_fp1_jump_t,  J4_cmpgtui_tp0_jump_nt, J4_cmpgtui_tp0_jump_t,
+    J4_cmpgtui_tp1_jump_nt, J4_cmpgtui_tp1_jump_t};
+static const unsigned cmpeqn1BitOpcode[8] = {
+    J4_cmpeqn1_fp0_jump_nt, J4_cmpeqn1_fp0_jump_t, J4_cmpeqn1_fp1_jump_nt,
+    J4_cmpeqn1_fp1_jump_t,  J4_cmpeqn1_tp0_jump_nt, J4_cmpeqn1_tp0_jump_t,
+    J4_cmpeqn1_tp1_jump_nt, J4_cmpeqn1_tp1_jump_t};
+static const unsigned cmpgtn1BitOpcode[8] = {
+    J4_cmpgtn1_fp0_jump_nt, J4_cmpgtn1_fp0_jump_t,  J4_cmpgtn1_fp1_jump_nt,
+    J4_cmpgtn1_fp1_jump_t,  J4_cmpgtn1_tp0_jump_nt, J4_cmpgtn1_tp0_jump_t,
+    J4_cmpgtn1_tp1_jump_nt, J4_cmpgtn1_tp1_jump_t,
+};
+
+// Classify MI as a compound candidate; returns a HexagonII::CompoundGroup.
+namespace {
+unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
+  unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
+
+  switch (MI.getOpcode()) {
+  default:
+    return HexagonII::HCG_None;
+  //
+  // Compound pairs.
+  // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2"
+  // "Rd16=#U6 ; jump #r9:2"
+  // "Rd16=Rs16 ; jump #r9:2"
+  //
+  case Hexagon::C2_cmpeq:
+  case Hexagon::C2_cmpgt:
+  case Hexagon::C2_cmpgtu:
+    if (IsExtended)
+      return HexagonII::HCG_None; // extended insns are never candidates
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    Src2Reg = MI.getOperand(2).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::C2_cmpeqi:
+  case Hexagon::C2_cmpgti:
+  case Hexagon::C2_cmpgtui:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        MI.getOperand(2).isImm() && ((isUInt<5>(MI.getOperand(2).getImm())) ||
+                                     (MI.getOperand(2).getImm() == -1)))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfr:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // Rd = Rs
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::A2_tfrsi:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    // Rd = #u6
+    DstReg = MI.getOperand(0).getReg();
+    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() <= 63 &&
+        MI.getOperand(1).getImm() >= 0 &&
+        HexagonMCInstrInfo::isIntRegForSubInst(DstReg))
+      return HexagonII::HCG_A;
+    break;
+  case Hexagon::S2_tstbit_i:
+    if (IsExtended)
+      return HexagonII::HCG_None;
+    DstReg = MI.getOperand(0).getReg();
+    Src1Reg = MI.getOperand(1).getReg();
+    if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) &&
+        MI.getOperand(2).isImm() &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        (MI.getOperand(2).getImm() == 0))
+      return HexagonII::HCG_A;
+    break;
+  // The fact that .new form is used pretty much guarantees
+  // that predicate register will match. Nevertheless,
+  // there could be some false positives without additional
+  // checking.
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnewpt:
+    Src1Reg = MI.getOperand(0).getReg();
+    if (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg)
+      return HexagonII::HCG_B;
+    break;
+  // Transfer and jump:
+  // Rd=#U6 ; jump #r9:2
+  // Rd=Rs ; jump #r9:2
+  // Do not test for jump range here.
+  case Hexagon::J2_jump:
+  case Hexagon::RESTORE_DEALLOC_RET_JMP_V4:
+    return HexagonII::HCG_C;
+    break;
+  }
+
+  return HexagonII::HCG_None; // recognised opcode, operands do not qualify
+}
+}
+
+/// getCompoundOp - Return the OpcodeIndex (0-7) for a predicated .new jump.
+namespace {
+unsigned getCompoundOp(MCInst const &HMCI) {
+  const MCOperand &Predicate = HMCI.getOperand(0);
+  unsigned PredReg = Predicate.getReg();
+
+  assert((PredReg == Hexagon::P0) || (PredReg == Hexagon::P1) ||
+         (PredReg == Hexagon::P2) || (PredReg == Hexagon::P3));
+
+  switch (HMCI.getOpcode()) { // NOTE(review): non-P0 always picks the p1 entry
+  default:
+    llvm_unreachable("Expected match not found.\n");
+    break;
+  case Hexagon::J2_jumpfnew:
+    return (PredReg == Hexagon::P0) ? fp0_jump_nt : fp1_jump_nt;
+  case Hexagon::J2_jumpfnewpt:
+    return (PredReg == Hexagon::P0) ? fp0_jump_t : fp1_jump_t;
+  case Hexagon::J2_jumptnew:
+    return (PredReg == Hexagon::P0) ? tp0_jump_nt : tp1_jump_nt;
+  case Hexagon::J2_jumptnewpt:
+    return (PredReg == Hexagon::P0) ? tp0_jump_t : tp1_jump_t;
+  }
+}
+}
+
+namespace {
+MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
+  MCInst *CompoundInsn = nullptr; // stays null when L cannot be compounded
+  unsigned compoundOpcode;
+  MCOperand Rs, Rt;
+
+  switch (L.getOpcode()) { // L: compare/transfer half, R: jump half
+  default:
+    DEBUG(dbgs() << "Possible compound ignored\n");
+    return CompoundInsn;
+
+  case Hexagon::A2_tfrsi:
+    Rt = L.getOperand(0);
+    compoundOpcode = J4_jumpseti;
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(L.getOperand(1)); // Immediate
+    CompoundInsn->addOperand(R.getOperand(0)); // Jump target
+    break;
+
+  case Hexagon::A2_tfr:
+    Rt = L.getOperand(0);
+    Rs = L.getOperand(1);
+
+    compoundOpcode = J4_jumpsetr;
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(R.getOperand(0)); // Jump target.
+
+    break;
+
+  case Hexagon::C2_cmpeq:
+    DEBUG(dbgs() << "CX: C2_cmpeq\n");
+    Rs = L.getOperand(1);
+    Rt = L.getOperand(2);
+
+    compoundOpcode = cmpeqBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(R.getOperand(1)); // presumably the jump target
+    break;
+
+  case Hexagon::C2_cmpgt:
+    DEBUG(dbgs() << "CX: C2_cmpgt\n");
+    Rs = L.getOperand(1);
+    Rt = L.getOperand(2);
+
+    compoundOpcode = cmpgtBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgtu:
+    DEBUG(dbgs() << "CX: C2_cmpgtu\n");
+    Rs = L.getOperand(1);
+    Rt = L.getOperand(2);
+
+    compoundOpcode = cmpgtuBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(Rt);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpeqi:
+    DEBUG(dbgs() << "CX: C2_cmpeqi\n");
+    if (L.getOperand(2).getImm() == -1)
+      compoundOpcode = cmpeqn1BitOpcode[getCompoundOp(R)];
+    else
+      compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)];
+
+    Rs = L.getOperand(1);
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    if (L.getOperand(2).getImm() != -1) // the n1 forms take no immediate
+      CompoundInsn->addOperand(L.getOperand(2));
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgti:
+    DEBUG(dbgs() << "CX: C2_cmpgti\n");
+    if (L.getOperand(2).getImm() == -1)
+      compoundOpcode = cmpgtn1BitOpcode[getCompoundOp(R)];
+    else
+      compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)];
+
+    Rs = L.getOperand(1);
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    if (L.getOperand(2).getImm() != -1) // the n1 forms take no immediate
+      CompoundInsn->addOperand(L.getOperand(2));
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::C2_cmpgtui:
+    DEBUG(dbgs() << "CX: C2_cmpgtui\n");
+    Rs = L.getOperand(1);
+    compoundOpcode = cmpgtuiBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(L.getOperand(2));
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+
+  case Hexagon::S2_tstbit_i:
+    DEBUG(dbgs() << "CX: S2_tstbit_i\n");
+    Rs = L.getOperand(1);
+    compoundOpcode = tstBitOpcode[getCompoundOp(R)];
+    CompoundInsn = new (Context) MCInst;
+    CompoundInsn->setOpcode(compoundOpcode);
+    CompoundInsn->addOperand(Rs);
+    CompoundInsn->addOperand(R.getOperand(1));
+    break;
+  }
+
+  return CompoundInsn;
+}
+}
+
+/// Non-symmetrical: MIa must be the group-A half, MIb the jump half.
+namespace {
+bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
+                           MCInst const &MIb, bool IsExtendedB) {
+  unsigned const GroupA = getCompoundCandidateGroup(MIa, IsExtendedA);
+  unsigned const GroupB = getCompoundCandidateGroup(MIb, IsExtendedB);
+  if (GroupA != HexagonII::HCG_A)
+    return false;
+  unsigned const OpcodeA = MIa.getOpcode();
+  if (GroupB == HexagonII::HCG_C) // only a transfer pairs with a group-C jump
+    return OpcodeA == Hexagon::A2_tfr || OpcodeA == Hexagon::A2_tfrsi;
+  // Otherwise both halves must name the same register in operand 0.
+  return GroupB == HexagonII::HCG_B &&
+         MIa.getOperand(0).getReg() == MIb.getOperand(0).getReg();
+}
+}
+
+namespace {
+bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
+  assert(HexagonMCInstrInfo::isBundle(MCI));
+  bool JExtended = false; // true while J follows an immediate extender
+  for (MCInst::iterator J =
+           MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
+       J != MCI.end(); ++J) {
+    MCInst const *JumpInst = J->getInst();
+    if (HexagonMCInstrInfo::isImmext(*JumpInst)) {
+      JExtended = true;
+      continue;
+    }
+    if (llvm::HexagonMCInstrInfo::getType(MCII, *JumpInst) ==
+        HexagonII::TypeJ) {
+      // Try to pair with another insn (B)undled with jump.
+      bool BExtended = false; // same, for the candidate partner B
+      for (MCInst::iterator B =
+               MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
+           B != MCI.end(); ++B) {
+        MCInst const *Inst = B->getInst();
+        if (JumpInst == Inst)
+          continue; // do not pair the jump with itself
+        if (HexagonMCInstrInfo::isImmext(*Inst)) {
+          BExtended = true;
+          continue;
+        }
+        DEBUG(dbgs() << "J,B: " << JumpInst->getOpcode() << ","
+                     << Inst->getOpcode() << "\n");
+        if (isOrderedCompoundPair(*Inst, BExtended, *JumpInst, JExtended)) {
+          MCInst *CompoundInsn = getCompoundInsn(Context, *Inst, *JumpInst);
+          if (CompoundInsn) {
+            DEBUG(dbgs() << "B: " << Inst->getOpcode() << ","
+                         << JumpInst->getOpcode() << " Compounds to "
+                         << CompoundInsn->getOpcode() << "\n");
+            J->setInst(CompoundInsn); // compound takes the jump's slot
+            MCI.erase(B); // and the partner is dropped
+            return true; // bundle changed; tryCompound rescans
+          }
+        }
+        BExtended = false;
+      }
+    }
+    JExtended = false;
+  }
+  return false;
+}
+}
+
+/// tryCompound - Repeatedly scan a bundle for instruction pairs that can be
+/// replaced by a single compound instruction and rewrite the bundle in place.
+/// Each successful replacement removes one instruction from the bundle,
+/// leaving one more slot free.
+void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
+                                     MCContext &Context, MCInst &MCI) {
+  assert(MCI.getOpcode() == Hexagon::BUNDLE &&
+         "Non-Bundle where Bundle expected");
+
+  // A compound needs two instructions, so smaller bundles cannot form one.
+  if (MCI.size() >= 2) {
+    // lookForCompound rewrites at most one pair per call and reports whether
+    // it did; keep going until a full scan finds nothing.
+    bool Changed;
+    do
+      Changed = lookForCompound(MCII, Context, MCI);
+    while (Changed);
+  }
+
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
new file mode 100644
index 000000000000..eb629774a2cd
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -0,0 +1,1100 @@
+//===----- HexagonMCDuplexInfo.cpp - Instruction bundle checking ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements duplexing of instructions to reduce code size
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <map>
+
+using namespace llvm;
+using namespace Hexagon;
+
+#define DEBUG_TYPE "hexagon-mcduplex-info"
+
+// Maps each sub-instruction opcode to the key isOrderedDuplexPair compares.
+static std::pair<unsigned, unsigned> opcodeData[] = {
+    std::make_pair((unsigned)V4_SA1_addi, 0),
+    std::make_pair((unsigned)V4_SA1_addrx, 6144),
+    std::make_pair((unsigned)V4_SA1_addsp, 3072),
+    std::make_pair((unsigned)V4_SA1_and1, 4608),
+    std::make_pair((unsigned)V4_SA1_clrf, 6768),
+    std::make_pair((unsigned)V4_SA1_clrfnew, 6736),
+    std::make_pair((unsigned)V4_SA1_clrt, 6752),
+    std::make_pair((unsigned)V4_SA1_clrtnew, 6720),
+    std::make_pair((unsigned)V4_SA1_cmpeqi, 6400),
+    std::make_pair((unsigned)V4_SA1_combine0i, 7168),
+    std::make_pair((unsigned)V4_SA1_combine1i, 7176),
+    std::make_pair((unsigned)V4_SA1_combine2i, 7184),
+    std::make_pair((unsigned)V4_SA1_combine3i, 7192),
+    std::make_pair((unsigned)V4_SA1_combinerz, 7432),
+    std::make_pair((unsigned)V4_SA1_combinezr, 7424),
+    std::make_pair((unsigned)V4_SA1_dec, 4864),
+    std::make_pair((unsigned)V4_SA1_inc, 4352),
+    std::make_pair((unsigned)V4_SA1_seti, 2048),
+    std::make_pair((unsigned)V4_SA1_setin1, 6656),
+    std::make_pair((unsigned)V4_SA1_sxtb, 5376),
+    std::make_pair((unsigned)V4_SA1_sxth, 5120),
+    std::make_pair((unsigned)V4_SA1_tfr, 4096),
+    std::make_pair((unsigned)V4_SA1_zxtb, 5888),
+    std::make_pair((unsigned)V4_SA1_zxth, 5632),
+    std::make_pair((unsigned)V4_SL1_loadri_io, 0),
+    std::make_pair((unsigned)V4_SL1_loadrub_io, 4096),
+    std::make_pair((unsigned)V4_SL2_deallocframe, 7936),
+    std::make_pair((unsigned)V4_SL2_jumpr31, 8128),
+    std::make_pair((unsigned)V4_SL2_jumpr31_f, 8133),
+    std::make_pair((unsigned)V4_SL2_jumpr31_fnew, 8135),
+    std::make_pair((unsigned)V4_SL2_jumpr31_t, 8132),
+    std::make_pair((unsigned)V4_SL2_jumpr31_tnew, 8134),
+    std::make_pair((unsigned)V4_SL2_loadrb_io, 4096),
+    std::make_pair((unsigned)V4_SL2_loadrd_sp, 7680),
+    std::make_pair((unsigned)V4_SL2_loadrh_io, 0),
+    std::make_pair((unsigned)V4_SL2_loadri_sp, 7168),
+    std::make_pair((unsigned)V4_SL2_loadruh_io, 2048),
+    std::make_pair((unsigned)V4_SL2_return, 8000),
+    std::make_pair((unsigned)V4_SL2_return_f, 8005),
+    std::make_pair((unsigned)V4_SL2_return_fnew, 8007),
+    std::make_pair((unsigned)V4_SL2_return_t, 8004),
+    std::make_pair((unsigned)V4_SL2_return_tnew, 8006),
+    std::make_pair((unsigned)V4_SS1_storeb_io, 4096),
+    std::make_pair((unsigned)V4_SS1_storew_io, 0),
+    std::make_pair((unsigned)V4_SS2_allocframe, 7168),
+    std::make_pair((unsigned)V4_SS2_storebi0, 4608),
+    std::make_pair((unsigned)V4_SS2_storebi1, 4864),
+    std::make_pair((unsigned)V4_SS2_stored_sp, 2560),
+    std::make_pair((unsigned)V4_SS2_storeh_io, 0),
+    std::make_pair((unsigned)V4_SS2_storew_sp, 2048),
+    std::make_pair((unsigned)V4_SS2_storewi0, 4096),
+    std::make_pair((unsigned)V4_SS2_storewi1, 4352)};
+
+static std::map<unsigned, unsigned>
+    subinstOpcodeMap(opcodeData,
+                     opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0]));
+
+/// Ordered test: may a sub-instruction of group Ga be duplexed with one of
+/// group Gb? Which Gb values are accepted depends only on Ga.
+bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) {
+  bool const GbIsL1 = (Gb == HexagonII::HSIG_L1);
+  bool const GbIsL2 = (Gb == HexagonII::HSIG_L2);
+  bool const GbIsS1 = (Gb == HexagonII::HSIG_S1);
+  bool const GbIsS2 = (Gb == HexagonII::HSIG_S2);
+  bool const GbIsA = (Gb == HexagonII::HSIG_A);
+  switch (Ga) {
+  case HexagonII::HSIG_L1:
+    return GbIsL1 || GbIsA;
+  case HexagonII::HSIG_L2:
+    return GbIsL1 || GbIsL2 || GbIsA;
+  case HexagonII::HSIG_S1:
+    return GbIsL1 || GbIsL2 || GbIsS1 || GbIsA;
+  case HexagonII::HSIG_S2:
+    return GbIsL1 || GbIsL2 || GbIsS1 || GbIsS2 || GbIsA;
+  case HexagonII::HSIG_A:
+    return GbIsA;
+  case HexagonII::HSIG_Compound:
+    return Gb == HexagonII::HSIG_Compound;
+  }
+  return false; // HSIG_None and any other value of Ga never match.
+}
+
+/// Map the ordered pair of duplex sub-instruction groups (Ga, Gb) to the
+/// iclass value for that pairing, or 0xFFFFFFFF when there is none. Every
+/// inner default returns directly: a plain break would only leave the inner
+/// switch and then fall through into the next Ga case.
+unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
+  switch (Ga) {
+  case HexagonII::HSIG_None:
+  case HexagonII::HSIG_Compound: // Compound always yields 0xFFFFFFFF.
+  default:
+    break;
+  case HexagonII::HSIG_L1:
+    switch (Gb) {
+    default:
+      return 0xFFFFFFFF;
+    case HexagonII::HSIG_L1:
+      return 0;
+    case HexagonII::HSIG_A:
+      return 0x4;
+    }
+  case HexagonII::HSIG_L2:
+    switch (Gb) {
+    default:
+      return 0xFFFFFFFF;
+    case HexagonII::HSIG_L1:
+      return 0x1;
+    case HexagonII::HSIG_L2:
+      return 0x2;
+    case HexagonII::HSIG_A:
+      return 0x5;
+    }
+  case HexagonII::HSIG_S1:
+    switch (Gb) {
+    default:
+      return 0xFFFFFFFF;
+    case HexagonII::HSIG_L1:
+      return 0x8;
+    case HexagonII::HSIG_L2:
+      return 0x9;
+    case HexagonII::HSIG_S1:
+      return 0xA;
+    case HexagonII::HSIG_A:
+      return 0x6;
+    }
+  case HexagonII::HSIG_S2:
+    switch (Gb) {
+    default:
+      return 0xFFFFFFFF;
+    case HexagonII::HSIG_L1:
+      return 0xC;
+    case HexagonII::HSIG_L2:
+      return 0xD;
+    case HexagonII::HSIG_S1:
+      return 0xB;
+    case HexagonII::HSIG_S2:
+      return 0xE;
+    case HexagonII::HSIG_A:
+      return 0x7;
+    }
+  case HexagonII::HSIG_A:
+    switch (Gb) {
+    default:
+      return 0xFFFFFFFF;
+    case HexagonII::HSIG_A:
+      return 0x3;
+    }
+  }
+  return 0xFFFFFFFF;
+}
+
+unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
+  unsigned DstReg, PredReg, SrcReg, Src1Reg, Src2Reg;
+  // Returns the HSIG_* group MCI can be a sub-instruction of, else HSIG_None.
+  switch (MCI.getOpcode()) {
+  default: // No duplex group for this opcode.
+    return HexagonII::HSIG_None;
+  //
+  // Group L1:
+  //
+  // Rd = memw(Rs+#u4:2)
+  // Rd = memub(Rs+#u4:0)
+  case Hexagon::L2_loadri_io:
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    // Special case this one from Group L2.
+    // Rd = memw(r29+#u5:2)
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+          MCI.getOperand(2).isImm() &&
+          isShiftedUInt<5, 2>(MCI.getOperand(2).getImm())) {
+        return HexagonII::HSIG_L2;
+      }
+      // Rd = memw(Rs+#u4:2)
+      if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+          (MCI.getOperand(2).isImm() &&
+           isShiftedUInt<4, 2>(MCI.getOperand(2).getImm()))) {
+        return HexagonII::HSIG_L1;
+      }
+    }
+    break;
+  case Hexagon::L2_loadrub_io:
+    // Rd = memub(Rs+#u4:0)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        MCI.getOperand(2).isImm() && isUInt<4>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_L1;
+    }
+    break;
+  //
+  // Group L2:
+  //
+  // Rd = memh/memuh(Rs+#u3:1)
+  // Rd = memb(Rs+#u3:0)
+  // Rd = memw(r29+#u5:2) - Handled above.
+  // Rdd = memd(r29+#u5:3)
+  // deallocframe
+  // [if ([!]p0[.new])] dealloc_return
+  // [if ([!]p0[.new])] jumpr r31
+  case Hexagon::L2_loadrh_io:
+  case Hexagon::L2_loadruh_io:
+    // Rd = memh/memuh(Rs+#u3:1)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        MCI.getOperand(2).isImm() &&
+        isShiftedUInt<3, 1>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  case Hexagon::L2_loadrb_io:
+    // Rd = memb(Rs+#u3:0)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        MCI.getOperand(2).isImm() && isUInt<3>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  case Hexagon::L2_loadrd_io:
+    // Rdd = memd(r29+#u5:3)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+        MCI.getOperand(2).isImm() &&
+        isShiftedUInt<5, 3>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+
+  case Hexagon::L4_return:
+
+  case Hexagon::L2_deallocframe:
+    // dealloc_return / deallocframe: no operand constraints are checked.
+    return HexagonII::HSIG_L2;
+  case Hexagon::EH_RETURN_JMPR:
+
+  case Hexagon::J2_jumpr:
+  case Hexagon::JMPret:
+    // jumpr r31
+    // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
+    DstReg = MCI.getOperand(0).getReg();
+    if (Hexagon::R31 == DstReg) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+
+  case Hexagon::J2_jumprt:
+  case Hexagon::J2_jumprf:
+  case Hexagon::J2_jumprtnew:
+  case Hexagon::J2_jumprfnew:
+  case Hexagon::JMPrett:
+  case Hexagon::JMPretf:
+  case Hexagon::JMPrettnew:
+  case Hexagon::JMPretfnew:
+  case Hexagon::JMPrettnewpt:
+  case Hexagon::JMPretfnewpt:
+    DstReg = MCI.getOperand(1).getReg(); // jump register (must be r31)
+    SrcReg = MCI.getOperand(0).getReg(); // predicate (must be p0)
+    // [if ([!]p0[.new])] jumpr r31
+    if ((HexagonMCInstrInfo::isPredReg(SrcReg) && (Hexagon::P0 == SrcReg)) &&
+        (Hexagon::R31 == DstReg)) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  case Hexagon::L4_return_t:
+
+  case Hexagon::L4_return_f:
+
+  case Hexagon::L4_return_tnew_pnt:
+
+  case Hexagon::L4_return_fnew_pnt:
+
+  case Hexagon::L4_return_tnew_pt:
+
+  case Hexagon::L4_return_fnew_pt:
+    // [if ([!]p0[.new])] dealloc_return
+    SrcReg = MCI.getOperand(0).getReg();
+    if (Hexagon::P0 == SrcReg) {
+      return HexagonII::HSIG_L2;
+    }
+    break;
+  //
+  // Group S1:
+  //
+  // memw(Rs+#u4:2) = Rt
+  // memb(Rs+#u4:0) = Rt
+  case Hexagon::S2_storeri_io:
+    // Special case this one from Group S2.
+    // memw(r29+#u5:2) = Rt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntReg(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        Hexagon::R29 == Src1Reg && MCI.getOperand(1).isImm() &&
+        isShiftedUInt<5, 2>(MCI.getOperand(1).getImm())) {
+      return HexagonII::HSIG_S2;
+    }
+    // memw(Rs+#u4:2) = Rt
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        MCI.getOperand(1).isImm() &&
+        isShiftedUInt<4, 2>(MCI.getOperand(1).getImm())) {
+      return HexagonII::HSIG_S1;
+    }
+    break;
+  case Hexagon::S2_storerb_io:
+    // memb(Rs+#u4:0) = Rt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm())) {
+      return HexagonII::HSIG_S1;
+    }
+    break;
+  //
+  // Group S2:
+  //
+  // memh(Rs+#u3:1) = Rt
+  // memw(r29+#u5:2) = Rt
+  // memd(r29+#s6:3) = Rtt
+  // memw(Rs+#u4:2) = #U1
+  // memb(Rs+#u4) = #U1
+  // allocframe(#u5:3)
+  case Hexagon::S2_storerh_io:
+    // memh(Rs+#u3:1) = Rt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) &&
+        MCI.getOperand(1).isImm() &&
+        isShiftedUInt<3, 1>(MCI.getOperand(1).getImm())) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S2_storerd_io:
+    // memd(r29+#s6:3) = Rtt
+    Src1Reg = MCI.getOperand(0).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) &&
+        HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg &&
+        MCI.getOperand(1).isImm() &&
+        isShiftedInt<6, 3>(MCI.getOperand(1).getImm())) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S4_storeiri_io:
+    // memw(Rs+#u4:2) = #U1
+    Src1Reg = MCI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        MCI.getOperand(1).isImm() &&
+        isShiftedUInt<4, 2>(MCI.getOperand(1).getImm()) &&
+        MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S4_storeirb_io:
+    // memb(Rs+#u4) = #U1
+    Src1Reg = MCI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) &&
+        MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm()) &&
+        MCI.getOperand(2).isImm() && MCI.getOperand(2).isImm() &&
+        isUInt<1>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  case Hexagon::S2_allocframe: // allocframe(#u5:3)
+    if (MCI.getOperand(0).isImm() &&
+        isShiftedUInt<5, 3>(MCI.getOperand(0).getImm())) {
+      return HexagonII::HSIG_S2;
+    }
+    break;
+  //
+  // Group A:
+  //
+  // Rx = add(Rx,#s7)
+  // Rd = Rs
+  // Rd = #u6
+  // Rd = #-1
+  // if ([!]P0[.new]) Rd = #0
+  // Rd = add(r29,#u6:2)
+  // Rx = add(Rx,Rs)
+  // P0 = cmp.eq(Rs,#u2)
+  // Rdd = combine(#0,Rs)
+  // Rdd = combine(Rs,#0)
+  // Rdd = combine(#u2,#U2)
+  // Rd = add(Rs,#1)
+  // Rd = add(Rs,#-1)
+  // Rd = sxth/sxtb/zxtb/zxth(Rs)
+  // Rd = and(Rs,#1)
+  case Hexagon::A2_addi:
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      // Rd = add(r29,#u6:2)
+      if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg &&
+          MCI.getOperand(2).isImm() &&
+          isShiftedUInt<6, 2>(MCI.getOperand(2).getImm())) {
+        return HexagonII::HSIG_A;
+      }
+      // Rx = add(Rx,#s7); the #s7 range is tested in subInstWouldBeExtended.
+      if (DstReg == SrcReg) {
+        return HexagonII::HSIG_A;
+      }
+      // Rd = add(Rs,#1)
+      // Rd = add(Rs,#-1)
+      if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+          MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) ||
+                                        (MCI.getOperand(2).getImm() == -1))) {
+        return HexagonII::HSIG_A;
+      }
+    }
+    break;
+  case Hexagon::A2_add:
+    // Rx = add(Rx,Rs)
+    DstReg = MCI.getOperand(0).getReg();
+    Src1Reg = MCI.getOperand(1).getReg();
+    Src2Reg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_andir: // Rd = and(Rs,#1) or Rd = and(Rs,#255)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) ||
+                                      (MCI.getOperand(2).getImm() == 255))) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_tfr:
+    // Rd = Rs
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_tfrsi:
+    DstReg = MCI.getOperand(0).getReg();
+    // Rd = #u6 / Rd = #-1; the immediate is not range-checked here.
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::C2_cmoveit:
+  case Hexagon::C2_cmovenewit:
+  case Hexagon::C2_cmoveif:
+  case Hexagon::C2_cmovenewif:
+    // if ([!]P0[.new]) Rd = #0
+    // Actual form:
+    // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>;
+    DstReg = MCI.getOperand(0).getReg();  // Rd
+    PredReg = MCI.getOperand(1).getReg(); // P0
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        Hexagon::P0 == PredReg && MCI.getOperand(2).isImm() &&
+        MCI.getOperand(2).getImm() == 0) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::C2_cmpeqi:
+    // P0 = cmp.eq(Rs,#u2)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (Hexagon::P0 == DstReg &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        MCI.getOperand(2).isImm() && isUInt<2>(MCI.getOperand(2).getImm())) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_combineii:
+  case Hexagon::A4_combineii:
+    // Rdd = combine(#u2,#U2)
+    DstReg = MCI.getOperand(0).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        // TODO: Handle Globals/Symbols
+        (MCI.getOperand(1).isImm() && isUInt<2>(MCI.getOperand(1).getImm())) &&
+        ((MCI.getOperand(2).isImm() &&
+          isUInt<2>(MCI.getOperand(2).getImm())))) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A4_combineri:
+    // Rdd = combine(Rs,#0)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        (MCI.getOperand(2).isImm() && MCI.getOperand(2).getImm() == 0)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A4_combineir:
+    // Rdd = combine(#0,Rs)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(2).getReg();
+    if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) &&
+        (MCI.getOperand(1).isImm() && MCI.getOperand(1).getImm() == 0)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  case Hexagon::A2_sxtb:
+  case Hexagon::A2_sxth:
+  case Hexagon::A2_zxtb:
+  case Hexagon::A2_zxth:
+    // Rd = sxth/sxtb/zxtb/zxth(Rs)
+    DstReg = MCI.getOperand(0).getReg();
+    SrcReg = MCI.getOperand(1).getReg();
+    if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) &&
+        HexagonMCInstrInfo::isIntRegForSubInst(SrcReg)) {
+      return HexagonII::HSIG_A;
+    }
+    break;
+  }
+
+  return HexagonII::HSIG_None;
+}
+
+/// Return true when potentialDuplex, taken as a sub-instruction, carries an
+/// immediate operand that is an expression or is out of range for the short
+/// form. Only A2_addi (Rx = add(Rx,#s7)) and A2_tfrsi are examined.
+bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
+  unsigned const Opcode = potentialDuplex.getOpcode();
+
+  if (Opcode == Hexagon::A2_addi) {
+    // Only the Rx = add(Rx,#s7) form is tested.
+    unsigned const Rd = potentialDuplex.getOperand(0).getReg();
+    unsigned const Rs = potentialDuplex.getOperand(1).getReg();
+    if (Rd != Rs || !HexagonMCInstrInfo::isIntRegForSubInst(Rd))
+      return false;
+    // An expression operand always counts as extended.
+    auto const &Imm = potentialDuplex.getOperand(2);
+    if (Imm.isExpr())
+      return true;
+    return Imm.isImm() && !isShiftedInt<7, 0>(Imm.getImm());
+  }
+
+  if (Opcode == Hexagon::A2_tfrsi) {
+    unsigned const Rd = potentialDuplex.getOperand(0).getReg();
+    if (!HexagonMCInstrInfo::isIntRegForSubInst(Rd))
+      return false;
+    auto const &Imm = potentialDuplex.getOperand(1);
+    if (Imm.isExpr())
+      return true;
+    if (!Imm.isImm())
+      return false;
+    // Check for case of Rd = #-1.
+    if (Imm.getImm() == -1)
+      return false;
+    // Check for case of Rd = #u6.
+    return !isShiftedUInt<6, 0>(Imm.getImm());
+  }
+
+  // No other opcode is checked.
+  return false;
+}
+
+/// Non-symmetrical. See if MIa (slot 0) and MIb (slot 1) fit as a duplex pair.
+bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
+                                             MCInst const &MIa, bool ExtendedA,
+                                             MCInst const &MIb, bool ExtendedB,
+                                             bool bisReversable) {
+  // Slot 0 (MIa) cannot be extended in duplexes PRM 10.5
+  if (ExtendedA)
+    return false;
+  // Only A2_addi and A2_tfrsi can be extended in duplex form PRM 10.5
+  if (ExtendedB) {
+    unsigned Opcode = MIb.getOpcode();
+    if ((Opcode != Hexagon::A2_addi) && (Opcode != Hexagon::A2_tfrsi))
+      return false;
+  }
+  unsigned MIaG = HexagonMCInstrInfo::getDuplexCandidateGroup(MIa),
+           MIbG = HexagonMCInstrInfo::getDuplexCandidateGroup(MIb);
+
+  // If a duplex contains 2 insns in the same group, the insns must be
+  // ordered such that the numerically smaller opcode is in slot 1.
+  if ((MIaG != HexagonII::HSIG_None) && (MIaG == MIbG) && bisReversable) {
+    MCInst SubInst0 = HexagonMCInstrInfo::deriveSubInst(MIa);
+    MCInst SubInst1 = HexagonMCInstrInfo::deriveSubInst(MIb);
+
+    unsigned zeroedSubInstS0 =
+        subinstOpcodeMap.find(SubInst0.getOpcode())->second;
+    unsigned zeroedSubInstS1 =
+        subinstOpcodeMap.find(SubInst1.getOpcode())->second;
+
+    if (zeroedSubInstS0 < zeroedSubInstS1)
+      // subinstS0 (maps to slot 0) must be greater than
+      // subinstS1 (maps to slot 1)
+      return false;
+  }
+
+  // allocframe must always be in slot 0
+  if (MIb.getOpcode() == Hexagon::S2_allocframe)
+    return false;
+
+  if ((MIaG != HexagonII::HSIG_None) && (MIbG != HexagonII::HSIG_None)) {
+    // Prevent 2 instructions with extenders from duplexing
+    // Note that MIb (slot1) can be extended and MIa (slot0)
+    //   can never be extended
+    if (subInstWouldBeExtended(MIa))
+      return false;
+
+    // If duplexing produces an extender, but the original did not
+    //   have an extender, do not duplex.
+    if (subInstWouldBeExtended(MIb) && !ExtendedB)
+      return false;
+  }
+
+  // If jumpr r31 appears, it must be in slot 0, and never slot 1 (MIb).
+  if (MIbG == HexagonII::HSIG_L2) {
+    if ((MIb.getNumOperands() > 1) && MIb.getOperand(1).isReg() &&
+        (MIb.getOperand(1).getReg() == Hexagon::R31))
+      return false;
+    if ((MIb.getNumOperands() > 0) && MIb.getOperand(0).isReg() &&
+        (MIb.getOperand(0).getReg() == Hexagon::R31))
+      return false;
+  }
+
+  // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
+  //   therefore, not duplexable if slot 1 is a store, and slot 0 is not.
+  if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
+    if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
+      return false;
+  }
+
+  return (isDuplexPairMatch(MIaG, MIbG));
+}
+
+/// Symmetric check: true if MIa and MIb match as a duplex pair in either order.
+bool HexagonMCInstrInfo::isDuplexPair(MCInst const &MIa, MCInst const &MIb) {
+  unsigned const GroupA = getDuplexCandidateGroup(MIa);
+  unsigned const GroupB = getDuplexCandidateGroup(MIb);
+  return isDuplexPairMatch(GroupA, GroupB) || isDuplexPairMatch(GroupB, GroupA);
+}
+
+/// Append operand opNum of Inst to subInstPtr; registers must be listed below.
+inline static void addOps(MCInst &subInstPtr, MCInst const &Inst,
+                          unsigned opNum) {
+  auto const &Operand = Inst.getOperand(opNum);
+  if (Operand.isReg()) {
+    switch (Operand.getReg()) {
+    case Hexagon::R0:
+    case Hexagon::R1:
+    case Hexagon::R2:
+    case Hexagon::R3:
+    case Hexagon::R4:
+    case Hexagon::R5:
+    case Hexagon::R6:
+    case Hexagon::R7:
+    case Hexagon::D0:
+    case Hexagon::D1:
+    case Hexagon::D2:
+    case Hexagon::D3:
+    case Hexagon::R16:
+    case Hexagon::R17:
+    case Hexagon::R18:
+    case Hexagon::R19:
+    case Hexagon::R20:
+    case Hexagon::R21:
+    case Hexagon::R22:
+    case Hexagon::R23:
+    case Hexagon::D8:
+    case Hexagon::D9:
+    case Hexagon::D10:
+    case Hexagon::D11:
+      break;
+    default:
+      llvm_unreachable("Not Duplexable Register");
+    }
+  }
+  subInstPtr.addOperand(Operand);
+}
+
+MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
+  MCInst Result;
+  switch (Inst.getOpcode()) {
+  default:
+    // dbgs() << "opcode: "<< Inst->getOpcode() << "\n";
+    llvm_unreachable("Unimplemented subinstruction \n");
+    break;
+  case Hexagon::A2_addi:
+    if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == 1) {
+      Result.setOpcode(Hexagon::V4_SA1_inc);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break;
+    } //  1,2 SUBInst $Rd = add($Rs, #1)
+    else if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == -1) {
+      Result.setOpcode(Hexagon::V4_SA1_dec);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break;
+    } //  1,2 SUBInst $Rd = add($Rs,#-1)
+    else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::V4_SA1_addsp);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break;
+    } //  1,3 SUBInst $Rd = add(r29, #$u6_2)
+    else {
+      Result.setOpcode(Hexagon::V4_SA1_addi);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
+      break;
+    } //    1,2,3 SUBInst $Rx = add($Rx, #$s7)
+  case Hexagon::A2_add:
+    Result.setOpcode(Hexagon::V4_SA1_addrx);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rx = add($_src_, $Rs)
+  case Hexagon::S2_allocframe:
+    Result.setOpcode(Hexagon::V4_SS2_allocframe);
+    addOps(Result, Inst, 0);
+    break; //    1 SUBInst allocframe(#$u5_3)
+  case Hexagon::A2_andir:
+    if (Inst.getOperand(2).getImm() == 255) {
+      Result.setOpcode(Hexagon::V4_SA1_zxtb);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2    $Rd = and($Rs, #255)
+    } else {
+      Result.setOpcode(Hexagon::V4_SA1_and1);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2 SUBInst $Rd = and($Rs, #1)
+    }
+  case Hexagon::C2_cmpeqi:
+    Result.setOpcode(Hexagon::V4_SA1_cmpeqi);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    2,3 SUBInst p0 = cmp.eq($Rs, #$u2)
+  case Hexagon::A4_combineii:
+  case Hexagon::A2_combineii:
+    if (Inst.getOperand(1).getImm() == 1) {
+      Result.setOpcode(Hexagon::V4_SA1_combine1i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#1, #$u2)
+    }
+
+    if (Inst.getOperand(1).getImm() == 3) {
+      Result.setOpcode(Hexagon::V4_SA1_combine3i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#3, #$u2)
+    }
+    if (Inst.getOperand(1).getImm() == 0) {
+      Result.setOpcode(Hexagon::V4_SA1_combine0i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#0, #$u2)
+    }
+    if (Inst.getOperand(1).getImm() == 2) {
+      Result.setOpcode(Hexagon::V4_SA1_combine2i);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  1,3 SUBInst $Rdd = combine(#2, #$u2)
+    }
+  case Hexagon::A4_combineir:
+    Result.setOpcode(Hexagon::V4_SA1_combinezr);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 2);
+    break; //    1,3 SUBInst $Rdd = combine(#0, $Rs)
+
+  case Hexagon::A4_combineri:
+    Result.setOpcode(Hexagon::V4_SA1_combinerz);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //    1,2 SUBInst $Rdd = combine($Rs, #0)
+  case Hexagon::L4_return_tnew_pnt:
+  case Hexagon::L4_return_tnew_pt:
+    Result.setOpcode(Hexagon::V4_SL2_return_tnew);
+    break; //    none  SUBInst if (p0.new) dealloc_return:nt
+  case Hexagon::L4_return_fnew_pnt:
+  case Hexagon::L4_return_fnew_pt:
+    Result.setOpcode(Hexagon::V4_SL2_return_fnew);
+    break; //    none  SUBInst if (!p0.new) dealloc_return:nt
+  case Hexagon::L4_return_f:
+    Result.setOpcode(Hexagon::V4_SL2_return_f);
+    break; //    none  SUBInst if (!p0) dealloc_return
+  case Hexagon::L4_return_t:
+    Result.setOpcode(Hexagon::V4_SL2_return_t);
+    break; //    none  SUBInst if (p0) dealloc_return
+  case Hexagon::L4_return:
+    Result.setOpcode(Hexagon::V4_SL2_return);
+    break; //    none  SUBInst dealloc_return
+  case Hexagon::L2_deallocframe:
+    Result.setOpcode(Hexagon::V4_SL2_deallocframe);
+    break; //    none  SUBInst deallocframe
+  case Hexagon::EH_RETURN_JMPR:
+  case Hexagon::J2_jumpr:
+  case Hexagon::JMPret:
+    Result.setOpcode(Hexagon::V4_SL2_jumpr31);
+    break; //    none  SUBInst jumpr r31
+  case Hexagon::J2_jumprf:
+  case Hexagon::JMPretf:
+    Result.setOpcode(Hexagon::V4_SL2_jumpr31_f);
+    break; //    none  SUBInst if (!p0) jumpr r31
+  case Hexagon::J2_jumprfnew:
+  case Hexagon::JMPretfnewpt:
+  case Hexagon::JMPretfnew:
+    Result.setOpcode(Hexagon::V4_SL2_jumpr31_fnew);
+    break; //    none  SUBInst if (!p0.new) jumpr:nt r31
+  case Hexagon::J2_jumprt:
+  case Hexagon::JMPrett:
+    Result.setOpcode(Hexagon::V4_SL2_jumpr31_t);
+    break; //    none  SUBInst if (p0) jumpr r31
+  case Hexagon::J2_jumprtnew:
+  case Hexagon::JMPrettnewpt:
+  case Hexagon::JMPrettnew:
+    Result.setOpcode(Hexagon::V4_SL2_jumpr31_tnew);
+    break; //    none  SUBInst if (p0.new) jumpr:nt r31
+  case Hexagon::L2_loadrb_io:
+    Result.setOpcode(Hexagon::V4_SL2_loadrb_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memb($Rs + #$u3_0)
+  case Hexagon::L2_loadrd_io:
+    Result.setOpcode(Hexagon::V4_SL2_loadrd_sp);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 2);
+    break; //    1,3 SUBInst $Rdd = memd(r29 + #$u5_3)
+  case Hexagon::L2_loadrh_io:
+    Result.setOpcode(Hexagon::V4_SL2_loadrh_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memh($Rs + #$u3_1)
+  case Hexagon::L2_loadrub_io:
+    Result.setOpcode(Hexagon::V4_SL1_loadrub_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memub($Rs + #$u4_0)
+  case Hexagon::L2_loadruh_io:
+    Result.setOpcode(Hexagon::V4_SL2_loadruh_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst $Rd = memuh($Rs + #$u3_1)
+  case Hexagon::L2_loadri_io:
+    if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::V4_SL2_loadri_sp);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 2);
+      break; //  2 1,3 SUBInst $Rd = memw(r29 + #$u5_2)
+    } else {
+      Result.setOpcode(Hexagon::V4_SL1_loadri_io);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
+      break; //    1,2,3 SUBInst $Rd = memw($Rs + #$u4_2)
+    }
+  case Hexagon::S4_storeirb_io:
+    if (Inst.getOperand(2).getImm() == 0) {
+      Result.setOpcode(Hexagon::V4_SS2_storebi0);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2 SUBInst memb($Rs + #$u4_0)=#0
+    } else if (Inst.getOperand(2).getImm() == 1) {
+      Result.setOpcode(Hexagon::V4_SS2_storebi1);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //  2 1,2 SUBInst memb($Rs + #$u4_0)=#1
+    }
+  case Hexagon::S2_storerb_io:
+    Result.setOpcode(Hexagon::V4_SS1_storeb_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt
+  case Hexagon::S2_storerd_io:
+    Result.setOpcode(Hexagon::V4_SS2_stored_sp);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    2,3 SUBInst memd(r29 + #$s6_3) = $Rtt
+  case Hexagon::S2_storerh_io:
+    Result.setOpcode(Hexagon::V4_SS2_storeh_io);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    addOps(Result, Inst, 2);
+    break; //    1,2,3 SUBInst memh($Rs + #$u3_1) = $Rt
+  case Hexagon::S4_storeiri_io:
+    if (Inst.getOperand(2).getImm() == 0) {
+      Result.setOpcode(Hexagon::V4_SS2_storewi0);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //  3 1,2 SUBInst memw($Rs + #$u4_2)=#0
+    } else if (Inst.getOperand(2).getImm() == 1) {
+      Result.setOpcode(Hexagon::V4_SS2_storewi1);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //  3 1,2 SUBInst memw($Rs + #$u4_2)=#1
+    } else if (Inst.getOperand(0).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::V4_SS2_storew_sp);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
+      break; //  1 2,3 SUBInst memw(r29 + #$u5_2) = $Rt
+    }
+  case Hexagon::S2_storeri_io:
+    if (Inst.getOperand(0).getReg() == Hexagon::R29) {
+      Result.setOpcode(Hexagon::V4_SS2_storew_sp);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2); //  1,2,3 SUBInst memw(sp + #$u5_2) = $Rt
+    } else {
+      Result.setOpcode(Hexagon::V4_SS1_storew_io);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2); //  1,2,3 SUBInst memw($Rs + #$u4_2) = $Rt
+    }
+    break;
+  case Hexagon::A2_sxtb:
+    Result.setOpcode(Hexagon::V4_SA1_sxtb);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  1,2 SUBInst $Rd = sxtb($Rs)
+  case Hexagon::A2_sxth:
+    Result.setOpcode(Hexagon::V4_SA1_sxth);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  1,2 SUBInst $Rd = sxth($Rs)
+  case Hexagon::A2_tfr:
+    Result.setOpcode(Hexagon::V4_SA1_tfr);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //  1,2 SUBInst $Rd = $Rs
+  case Hexagon::C2_cmovenewif:
+    Result.setOpcode(Hexagon::V4_SA1_clrfnew);
+    addOps(Result, Inst, 0);
+    break; //  2 SUBInst if (!p0.new) $Rd = #0
+  case Hexagon::C2_cmovenewit:
+    Result.setOpcode(Hexagon::V4_SA1_clrtnew);
+    addOps(Result, Inst, 0);
+    break; //  2 SUBInst if (p0.new) $Rd = #0
+  case Hexagon::C2_cmoveif:
+    Result.setOpcode(Hexagon::V4_SA1_clrf);
+    addOps(Result, Inst, 0);
+    break; //  2 SUBInst if (!p0) $Rd = #0
+  case Hexagon::C2_cmoveit:
+    Result.setOpcode(Hexagon::V4_SA1_clrt);
+    addOps(Result, Inst, 0);
+    break; //  2 SUBInst if (p0) $Rd = #0
+  case Hexagon::A2_tfrsi:
+    if (Inst.getOperand(1).isImm() && Inst.getOperand(1).getImm() == -1) {
+      Result.setOpcode(Hexagon::V4_SA1_setin1);
+      addOps(Result, Inst, 0);
+      break; //  2 1 SUBInst $Rd = #-1
+    } else {
+      Result.setOpcode(Hexagon::V4_SA1_seti);
+      addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
+      break; //    1,2 SUBInst $Rd = #$u6
+    }
+  case Hexagon::A2_zxtb:
+    Result.setOpcode(Hexagon::V4_SA1_zxtb);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //    1,2    $Rd = and($Rs, #255)
+
+  case Hexagon::A2_zxth:
+    Result.setOpcode(Hexagon::V4_SA1_zxth);
+    addOps(Result, Inst, 0);
+    addOps(Result, Inst, 1);
+    break; //    1,2 SUBInst $Rd = zxth($Rs)
+  }
+  return Result;
+}
+
+static bool isStoreInst(unsigned opCode) { // Opcodes treated as stores.
+  switch (opCode) {
+  case Hexagon::S2_storeri_io:
+  case Hexagon::S2_storerb_io:
+  case Hexagon::S2_storerh_io:
+  case Hexagon::S2_storerd_io:
+  case Hexagon::S4_storeiri_io:
+  case Hexagon::S4_storeirb_io:
+  case Hexagon::S2_allocframe: // Counted as a store here as well.
+    return true;
+  default:
+    return false;
+  }
+}
+
+SmallVector<DuplexCandidate, 8>
+HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
+                                          MCInst const &MCB) {
+  assert(isBundle(MCB));
+  SmallVector<DuplexCandidate, 8> duplexToTry;
+  // Use an "order matters" version of isDuplexPair.
+  unsigned numInstrInPacket = MCB.getNumOperands();
+
+  for (unsigned distance = 1; distance < numInstrInPacket; ++distance) {
+    for (unsigned j = HexagonMCInstrInfo::bundleInstructionsOffset,
+                  k = j + distance;
+         (j < numInstrInPacket) && (k < numInstrInPacket); ++j, ++k) {
+
+      // Check if reversible: a pair of two stores is only tried in order.
+      bool bisReversable = true;
+      if (isStoreInst(MCB.getOperand(j).getInst()->getOpcode()) &&
+          isStoreInst(MCB.getOperand(k).getInst()->getOpcode())) {
+        DEBUG(dbgs() << "skip out of order write pair: " << k << "," << j
+                     << "\n");
+        bisReversable = false;
+      }
+
+      // Try in order: insn k is passed as the first member, insn j second.
+      if (isOrderedDuplexPair(
+              MCII, *MCB.getOperand(k).getInst(),
+              HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
+              *MCB.getOperand(j).getInst(),
+              HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
+              bisReversable)) {
+        // Get iClass.
+        unsigned iClass = iClassOfDuplexPair(
+            getDuplexCandidateGroup(*MCB.getOperand(k).getInst()),
+            getDuplexCandidateGroup(*MCB.getOperand(j).getInst()));
+
+        // Save off pairs for duplex checking.
+        duplexToTry.push_back(DuplexCandidate(j, k, iClass));
+        DEBUG(dbgs() << "adding pair: " << j << "," << k << ":"
+                     << MCB.getOperand(j).getInst()->getOpcode() << ","
+                     << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+        continue; // In-order pair recorded; skip the reverse attempt.
+      } else {
+        DEBUG(dbgs() << "skipping pair: " << j << "," << k << ":"
+                     << MCB.getOperand(j).getInst()->getOpcode() << ","
+                     << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+      }
+
+      // Try reverse: insn j is passed as the first member, insn k second.
+      if (bisReversable) {
+        if (isOrderedDuplexPair(
+                MCII, *MCB.getOperand(j).getInst(),
+                HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
+                *MCB.getOperand(k).getInst(),
+                HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
+                bisReversable)) {
+          // Get iClass.
+          unsigned iClass = iClassOfDuplexPair(
+              getDuplexCandidateGroup(*MCB.getOperand(j).getInst()),
+              getDuplexCandidateGroup(*MCB.getOperand(k).getInst()));
+
+          // Save off pairs for duplex checking.
+          duplexToTry.push_back(DuplexCandidate(k, j, iClass));
+          DEBUG(dbgs() << "adding pair:" << k << "," << j << ":"
+                       << MCB.getOperand(j).getInst()->getOpcode() << ","
+                       << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+        } else {
+          DEBUG(dbgs() << "skipping pair: " << k << "," << j << ":"
+                       << MCB.getOperand(j).getInst()->getOpcode() << ","
+                       << MCB.getOperand(k).getInst()->getOpcode() << "\n");
+        }
+      }
+    }
+  }
+  return duplexToTry;
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 93c7a0d98bf2..2731278f0e41 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -12,12 +12,53 @@
 //===----------------------------------------------------------------------===//
 
 #include "HexagonMCInstrInfo.h"
+
+#include "Hexagon.h"
 #include "HexagonBaseInfo.h"
 
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
 namespace llvm {
-void HexagonMCInstrInfo::AppendImplicitOperands(MCInst &MCI) {
-  MCI.addOperand(MCOperand::createImm(0));
-  MCI.addOperand(MCOperand::createInst(nullptr));
+iterator_range<MCInst::const_iterator>
+HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  return iterator_range<MCInst::const_iterator>(
+      MCI.begin() + bundleInstructionsOffset, MCI.end());
+}
+
+size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
+  if (HexagonMCInstrInfo::isBundle(MCI))
+    return (MCI.size() - bundleInstructionsOffset);
+  else
+    return (1); // A non-bundle MCInst counts as one instruction.
+}
+
+MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
+                                         MCInst const &inst0,
+                                         MCInst const &inst1) {
+  assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
+  MCInst *duplexInst = new (Context) MCInst;
+  duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
+
+  MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
+  MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
+  duplexInst->addOperand(MCOperand::createInst(SubInst0));
+  duplexInst->addOperand(MCOperand::createInst(SubInst1));
+  return duplexInst;
+}
+
+MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
+                                                   size_t Index) {
+  assert(Index <= bundleSize(MCB));
+  if (Index == 0)
+    return nullptr; // No instruction precedes the first one.
+  MCInst const *Inst = // The operand just before instruction Index.
+      MCB.getOperand(Index + bundleInstructionsOffset - 1).getInst();
+  if (isImmext(*Inst))
+    return Inst;
+  return nullptr; // The preceding instruction is not an extender.
 }
 
 HexagonII::MemAccessSize
@@ -46,6 +87,24 @@ MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
   return (MCII.get(MCI.getOpcode()));
 }
 
+unsigned short HexagonMCInstrInfo::getExtendableOp(MCInstrInfo const &MCII,
+                                                   MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+MCOperand const &
+HexagonMCInstrInfo::getExtendableOperand(MCInstrInfo const &MCII,
+                                         MCInst const &MCI) {
+  unsigned O = HexagonMCInstrInfo::getExtendableOp(MCII, MCI);
+  MCOperand const &MO = MCI.getOperand(O);
+
+  assert((HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+          HexagonMCInstrInfo::isExtended(MCII, MCI)) &&
+         (MO.isImm() || MO.isExpr()));
+  return (MO);
+}
+
 unsigned HexagonMCInstrInfo::getExtentAlignment(MCInstrInfo const &MCII,
                                                 MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -58,12 +117,6 @@ unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
   return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
 }
 
-std::bitset<16> HexagonMCInstrInfo::GetImplicitBits(MCInst const &MCI) {
-  SanityCheckImplicitOperands(MCI);
-  std::bitset<16> Bits(MCI.getOperand(MCI.getNumOperands() - 2).getImm());
-  return Bits;
-}
-
 // Return the max value that a constant extendable operand can have
 // without being extended.
 int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
@@ -99,9 +152,14 @@ char const *HexagonMCInstrInfo::getName(MCInstrInfo const &MCII,
   return MCII.getName(MCI.getOpcode());
 }
 
-// Return the operand that consumes or produces a new value.
-MCOperand const &HexagonMCInstrInfo::getNewValue(MCInstrInfo const &MCII,
+unsigned short HexagonMCInstrInfo::getNewValueOp(MCInstrInfo const &MCII,
                                                  MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask);
+}
+
+MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
+                                                        MCInst const &MCI) {
   uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
   unsigned const O =
       (F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
@@ -113,6 +171,21 @@ MCOperand const &HexagonMCInstrInfo::getNewValue(MCInstrInfo const &MCII,
   return (MCO);
 }
 
+int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+
+  HexagonII::SubTarget Target = static_cast<HexagonII::SubTarget>(
+      (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask);
+
+  switch (Target) {
+  default: // Every value other than HasV5SubT maps to ArchV4.
+    return Hexagon::ArchV4;
+  case HexagonII::HasV5SubT:
+    return Hexagon::ArchV5;
+  }
+}
+
 // Return the Hexagon ISA class for the insn.
 unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
                                      MCInst const &MCI) {
@@ -121,6 +194,32 @@ unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
   return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
 }
 
+unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
+                                      MCSubtargetInfo const &STI,
+                                      MCInst const &MCI) {
+
+  const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
+  int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+  return ((II[SchedClass].FirstStage + HexagonStages)->getUnits());
+}
+
+bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+  if (!HexagonMCInstrInfo::isBundle(MCI))
+    return false;
+
+  for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
+    auto MI = I.getInst();
+    if (isImmext(*MI))
+      return true;
+  }
+
+  return false;
+}
+
+bool HexagonMCInstrInfo::hasExtenderForIndex(MCInst const &MCB, size_t Index) {
+  return extenderForIndex(MCB, Index) != nullptr;
+}
+
 // Return whether the instruction is a legal new-value producer.
 bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
                                      MCInst const &MCI) {
@@ -128,6 +227,18 @@ bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
   return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
 }
 
+MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
+  assert(isBundle(MCB));
+  assert(Index < HEXAGON_PACKET_SIZE);
+  return *MCB.getOperand(bundleInstructionsOffset + Index).getInst();
+}
+
+bool HexagonMCInstrInfo::isBundle(MCInst const &MCI) {
+  auto Result = Hexagon::BUNDLE == MCI.getOpcode();
+  assert(!Result || (MCI.size() > 0 && MCI.getOperand(0).isImm()));
+  return Result;
+}
+
 // Return whether the insn is an actual insn.
 bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
   return (!HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
@@ -135,6 +246,15 @@ bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
           HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
 }
 
+bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
+  return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
+          (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
+}
+
+bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
 // Return whether the instruction needs to be constant extended.
 // 1) Always return true if the instruction has 'isExtended' flag set.
 //
@@ -173,20 +293,44 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
   return (ImmValue < MinValue || ImmValue > MaxValue);
 }
 
-// Return true if the instruction may be extended based on the operand value.
 bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
                                       MCInst const &MCI) {
   uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
   return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
 }
 
-// Return whether the instruction must be always extended.
 bool HexagonMCInstrInfo::isExtended(MCInstrInfo const &MCII,
                                     MCInst const &MCI) {
   uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
   return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
 }
 
+bool HexagonMCInstrInfo::isFloat(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::FPPos) & HexagonII::FPMask);
+}
+
+bool HexagonMCInstrInfo::isImmext(MCInst const &MCI) {
+  auto Op = MCI.getOpcode();
+  return (Op == Hexagon::A4_ext_b || Op == Hexagon::A4_ext_c ||
+          Op == Hexagon::A4_ext_g || Op == Hexagon::A4_ext);
+}
+
+bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  int64_t Flags = MCI.getOperand(0).getImm();
+  return (Flags & innerLoopMask) != 0;
+}
+
+bool HexagonMCInstrInfo::isIntReg(unsigned Reg) {
+  return (Reg >= Hexagon::R0 && Reg <= Hexagon::R31);
+}
+
+bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) {
+  return ((Reg >= Hexagon::R0 && Reg <= Hexagon::R7) ||
+          (Reg >= Hexagon::R16 && Reg <= Hexagon::R23));
+}
+
 // Return whether the insn is a new-value consumer.
 bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
                                     MCInst const &MCI) {
@@ -203,46 +347,103 @@ bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
          OperandNum;
 }
 
-bool HexagonMCInstrInfo::isPacketBegin(MCInst const &MCI) {
-  std::bitset<16> Bits(GetImplicitBits(MCI));
-  return Bits.test(packetBeginIndex);
+bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
+  assert(isBundle(MCI));
+  int64_t Flags = MCI.getOperand(0).getImm();
+  return (Flags & outerLoopMask) != 0;
 }
 
-bool HexagonMCInstrInfo::isPacketEnd(MCInst const &MCI) {
-  std::bitset<16> Bits(GetImplicitBits(MCI));
-  return Bits.test(packetEndIndex);
+bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+}
+
+bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII,
+                                          MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (
+      !((F >> HexagonII::PredicatedFalsePos) & HexagonII::PredicatedFalseMask));
+}
+
+bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
+  return (Reg >= Hexagon::P0 && Reg <= Hexagon::P3_0);
 }
 
-// Return whether the insn is a prefix.
 bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
   return (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypePREFIX);
 }
 
-// Return whether the insn is solo, i.e., cannot be in a packet.
 bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
   return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
 }
 
-void HexagonMCInstrInfo::resetPacket(MCInst &MCI) {
-  setPacketBegin(MCI, false);
-  setPacketEnd(MCI, false);
+bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
+}
+
+bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
+}
+
+void HexagonMCInstrInfo::padEndloop(MCInst &MCB) {
+  MCInst Nop; // Prototype nop; a heap copy is appended per padding slot.
+  Nop.setOpcode(Hexagon::A2_nop); // NOTE(review): who frees the copies?
+  assert(isBundle(MCB));
+  while ((HexagonMCInstrInfo::isInnerLoop(MCB) &&
+          (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) ||
+         ((HexagonMCInstrInfo::isOuterLoop(MCB) &&
+           (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE))))
+    MCB.addOperand(MCOperand::createInst(new MCInst(Nop)));
+}
+
+bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
+                                      MCInst const &MCI) {
+  if (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR)
+    return false;
+
+  unsigned SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+  switch (SchedClass) {
+  case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
+  case Hexagon::Sched::ALU64_tc_2_SLOT23:
+  case Hexagon::Sched::ALU64_tc_3x_SLOT23:
+  case Hexagon::Sched::M_tc_2_SLOT23:
+  case Hexagon::Sched::M_tc_3x_SLOT23:
+  case Hexagon::Sched::S_2op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_2_SLOT23:
+  case Hexagon::Sched::S_3op_tc_3x_SLOT23:
+    return true;
+  }
+  return false;
 }
 
-void HexagonMCInstrInfo::SetImplicitBits(MCInst &MCI, std::bitset<16> Bits) {
-  SanityCheckImplicitOperands(MCI);
-  MCI.getOperand(MCI.getNumOperands() - 2).setImm(Bits.to_ulong());
+void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
+                                       DuplexCandidate Candidate) {
+  assert(Candidate.packetIndexI < MCB.size());
+  assert(Candidate.packetIndexJ < MCB.size());
+  assert(isBundle(MCB));
+  MCInst *Duplex =
+      deriveDuplex(Context, Candidate.iClass,
+                   *MCB.getOperand(Candidate.packetIndexJ).getInst(),
+                   *MCB.getOperand(Candidate.packetIndexI).getInst());
+  assert(Duplex != nullptr);
+  MCB.getOperand(Candidate.packetIndexI).setInst(Duplex);
+  MCB.erase(MCB.begin() + Candidate.packetIndexJ);
 }
 
-void HexagonMCInstrInfo::setPacketBegin(MCInst &MCI, bool f) {
-  std::bitset<16> Bits(GetImplicitBits(MCI));
-  Bits.set(packetBeginIndex, f);
-  SetImplicitBits(MCI, Bits);
+void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &Operand = MCI.getOperand(0);
+  Operand.setImm(Operand.getImm() | innerLoopMask);
 }
 
-void HexagonMCInstrInfo::setPacketEnd(MCInst &MCI, bool f) {
-  std::bitset<16> Bits(GetImplicitBits(MCI));
-  Bits.set(packetEndIndex, f);
-  SetImplicitBits(MCI, Bits);
+void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
+  assert(isBundle(MCI));
+  MCOperand &Operand = MCI.getOperand(0);
+  Operand.setImm(Operand.getImm() | outerLoopMask);
 }
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 082c80d5ac05..09f305f638e2 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -1,4 +1,4 @@
-//===- HexagonMCInstrInfo.cpp - Hexagon sub-class of MCInst ---------------===//
+//===- HexagonMCInstrInfo.h - Utility functions on Hexagon MCInsts --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,20 +15,47 @@
 #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
 
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-
-#include <bitset>
 
 namespace llvm {
+class MCContext;
 class MCInstrDesc;
 class MCInstrInfo;
 class MCInst;
 class MCOperand;
+class MCSubtargetInfo;
 namespace HexagonII {
 enum class MemAccessSize;
 }
+class DuplexCandidate {
+public:
+  unsigned packetIndexI, packetIndexJ, iClass;
+  DuplexCandidate(unsigned i, unsigned j, unsigned iClass)
+      : packetIndexI(i), packetIndexJ(j), iClass(iClass) {}
+};
 namespace HexagonMCInstrInfo {
-void AppendImplicitOperands(MCInst &MCI);
+size_t const innerLoopOffset = 0;
+int64_t const innerLoopMask = 1 << innerLoopOffset;
+
+size_t const outerLoopOffset = 1;
+int64_t const outerLoopMask = 1 << outerLoopOffset;
+
+size_t const bundleInstructionsOffset = 1;
+
+// Returns the number of instructions in the bundle
+size_t bundleSize(MCInst const &MCI);
+
+// Returns an iterator range of instructions in this bundle
+iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI);
+
+// Return the extender for instruction at Index or nullptr if none
+MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+
+// Create a duplex instruction given the two subinsts
+MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
+                     MCInst const &inst1);
+
+// Convert this instruction into a duplex subinst
+MCInst deriveSubInst(MCInst const &Inst);
 
 // Return memory access size
 HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII,
@@ -42,14 +69,26 @@ unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
 
 MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
 
+// Return which duplex group this instruction belongs to
+unsigned getDuplexCandidateGroup(MCInst const &MI);
+
+// Return a list of all possible instruction duplex combinations
+SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
+                                                      MCInst const &MCB);
+
+// Return the index of the extendable operand
+unsigned short getExtendableOp(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return a reference to the extendable operand
+MCOperand const &getExtendableOperand(MCInstrInfo const &MCII,
+                                      MCInst const &MCI);
+
 // Return the implicit alignment of the extendable operand
 unsigned getExtentAlignment(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Return the number of logical bits of the extendable operand
 unsigned getExtentBits(MCInstrInfo const &MCII, MCInst const &MCI);
 
-std::bitset<16> GetImplicitBits(MCInst const &MCI);
-
 // Return the max value that a constant extendable operand can have
 // without being extended.
 int getMaxValue(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -61,27 +100,77 @@ int getMinValue(MCInstrInfo const &MCII, MCInst const &MCI);
 // Return instruction name
 char const *getName(MCInstrInfo const &MCII, MCInst const &MCI);
 
+// Return the operand index for the new value.
+unsigned short getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI);
+
 // Return the operand that consumes or produces a new value.
-MCOperand const &getNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI);
+
+int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Return the Hexagon ISA class for the insn.
 unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
 
+/// Return the slots used by the insn.
+unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                  MCInst const &MCI);
+
+// Does the packet have an extender for the instruction at Index
+bool hasExtenderForIndex(MCInst const &MCB, size_t Index);
+
+bool hasImmExt(MCInst const &MCI);
+
 // Return whether the instruction is a legal new-value producer.
 bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
 
+// Return the instruction at Index
+MCInst const &instruction(MCInst const &MCB, size_t Index);
+
+// Returns whether this MCInst is a wellformed bundle
+bool isBundle(MCInst const &MCI);
+
 // Return whether the insn is an actual insn.
 bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
 
+// Return the duplex iclass given the two duplex classes
+unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
+
 // Return whether the instruction needs to be constant extended.
 bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
 
+// Is this double register suitable for use in a duplex subinst
+bool isDblRegForSubInst(unsigned Reg);
+
+// Is this a duplex instruction
+bool isDuplex(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Can these instructions be duplexed
+bool isDuplexPair(MCInst const &MIa, MCInst const &MIb);
+
+// Can these duplex classes be combined into a duplex instruction
+bool isDuplexPairMatch(unsigned Ga, unsigned Gb);
+
 // Return true if the insn may be extended based on the operand value.
 bool isExtendable(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Return whether the instruction must be always extended.
 bool isExtended(MCInstrInfo const &MCII, MCInst const &MCI);
 
+/// Return whether it is a floating-point insn.
+bool isFloat(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Returns whether this instruction is an immediate extender
+bool isImmext(MCInst const &MCI);
+
+// Returns whether this bundle is an endloop0
+bool isInnerLoop(MCInst const &MCI);
+
+// Is this an integer register
+bool isIntReg(unsigned Reg);
+
+// Is this register suitable for use in a duplex subinst
+bool isIntRegForSubInst(unsigned Reg);
+
 // Return whether the insn is a new-value consumer.
 bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
 
@@ -89,9 +178,22 @@ bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
 bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
                        unsigned short OperandNum);
 
-bool isPacketBegin(MCInst const &MCI);
+// Can these two instructions be duplexed
+bool isOrderedDuplexPair(MCInstrInfo const &MCII, MCInst const &MIa,
+                         bool ExtendedA, MCInst const &MIb, bool ExtendedB,
+                         bool bisReversable);
+
+// Returns whether this bundle is an endloop1
+bool isOuterLoop(MCInst const &MCI);
+
+// Return whether this instruction is predicated
+bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Return whether the predicate sense is true
+bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI);
 
-bool isPacketEnd(MCInst const &MCI);
+// Is this a predicate register
+bool isPredReg(unsigned Reg);
 
 // Return whether the insn is a prefix.
 bool isPrefix(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -99,23 +201,31 @@ bool isPrefix(MCInstrInfo const &MCII, MCInst const &MCI);
 // Return whether the insn is solo, i.e., cannot be in a packet.
 bool isSolo(MCInstrInfo const &MCII, MCInst const &MCI);
 
-static const size_t packetBeginIndex = 0;
-static const size_t packetEndIndex = 1;
+/// Return whether the insn can be packaged only with A and X-type insns.
+bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI);
 
-void resetPacket(MCInst &MCI);
+/// Return whether the insn can be packaged only with an A-type insn in slot #1.
+bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI);
 
-inline void SanityCheckImplicitOperands(MCInst const &MCI) {
-  assert(MCI.getNumOperands() >= 2 && "At least the two implicit operands");
-  assert(MCI.getOperand(MCI.getNumOperands() - 1).isInst() &&
-         "Implicit bits and flags");
-  assert(MCI.getOperand(MCI.getNumOperands() - 2).isImm() && "Parent pointer");
-}
+// Pad the bundle with nops to satisfy endloop requirements
+void padEndloop(MCInst &MCI);
+
+bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Replace the instructions inside MCB, represented by Candidate
+void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+
+// Marks a bundle as endloop0
+void setInnerLoop(MCInst &MCI);
 
-void SetImplicitBits(MCInst &MCI, std::bitset<16> Bits);
+// Marks a bundle as endloop1
+void setOuterLoop(MCInst &MCI);
 
-void setPacketBegin(MCInst &MCI, bool Y);
+// Would duplexing this instruction create a requirement to extend
+bool subInstWouldBeExtended(MCInst const &potentialDuplex);
 
-void setPacketEnd(MCInst &MCI, bool Y);
+// Attempt to find and replace compound pairs
+void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
 }
 }
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
new file mode 100644
index 000000000000..8e70280c1a0d
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -0,0 +1,237 @@
+//===----- HexagonMCShuffler.cpp - MC bundle shuffling --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-shuffle"
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    DisableShuffle("disable-hexagon-shuffle", cl::Hidden, cl::init(false),
+                   cl::desc("Disable Hexagon instruction shuffling"));
+
+void HexagonMCShuffler::init(MCInst &MCB) {
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    // Hand each insn to the shuffler; an extender rides with the next insn.
+    MCInst const *PendingExt = nullptr;
+    for (const auto &Op : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      assert(!HexagonMCInstrInfo::getDesc(MCII, *Op.getInst()).isPseudo());
+      MCInst *Inst = const_cast<MCInst *>(Op.getInst());
+      if (HexagonMCInstrInfo::isImmext(*Inst)) {
+        PendingExt = Inst;
+        continue;
+      }
+      const unsigned Units = HexagonMCInstrInfo::getUnits(MCII, STI, *Inst);
+      append(Inst, PendingExt, Units, false);
+      PendingExt = nullptr;
+    }
+  }
+
+  BundleFlags = MCB.getOperand(0).getImm();
+}
+
+// Load MCB into the shuffler with AddMI queued before or after its insns.
+void HexagonMCShuffler::init(MCInst &MCB, MCInst const *AddMI,
+                             bool bInsertAtFront) {
+  if (HexagonMCInstrInfo::isBundle(MCB)) {
+    if (AddMI && bInsertAtFront)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
+             false);
+    MCInst const *PendingExt = nullptr;
+    for (auto const &Op : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+      assert(!HexagonMCInstrInfo::getDesc(MCII, *Op.getInst()).isPseudo());
+      MCInst *Inst = const_cast<MCInst *>(Op.getInst());
+      if (HexagonMCInstrInfo::isImmext(*Inst)) {
+        PendingExt = Inst;
+        continue;
+      }
+      const unsigned Units = HexagonMCInstrInfo::getUnits(MCII, STI, *Inst);
+      append(Inst, PendingExt, Units, false);
+      PendingExt = nullptr;
+    }
+    if (AddMI && !bInsertAtFront)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
+             false);
+  }
+  BundleFlags = MCB.getOperand(0).getImm();
+}
+
+// Rebuild MCB from the shuffler: the saved bundle flags, then every handle in
+// its current order, each preceded by its extender when it has one.
+void HexagonMCShuffler::copyTo(MCInst &MCB) {
+  MCB.clear();
+  MCB.addOperand(MCOperand::createImm(BundleFlags));
+
+  for (HexagonShuffler::iterator It = begin(); It != end(); ++It) {
+    MCInst const *Inst = It->getDesc();
+    if (MCInst const *Ext = It->getExtender())
+      MCB.addOperand(MCOperand::createInst(Ext));
+    MCB.addOperand(MCOperand::createInst(Inst));
+  }
+}
+
+// Shuffle and, on success, write the new order into MCB; on failure MCB is
+// only dumped under DEBUG.  Returns true when no error code is set.
+bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
+  if (!shuffle())
+    DEBUG(MCB.dump());
+  else
+    copyTo(MCB);
+  return getError() == SHUFFLE_SUCCESS;
+}
+
+bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                            MCInst &MCB) {
+  HexagonMCShuffler MCS(MCII, STI, MCB);
+
+  // Shuffling was turned off on the command line.
+  if (DisableShuffle)
+    return false;
+
+  if (!HexagonMCInstrInfo::bundleSize(MCB)) {
+    // There once was a bundle:
+    //    BUNDLE %D2<imp-def>, %R4<imp-def>, %R5<imp-def>, %D7<imp-def>, ...
+    //      * %D2<def> = IMPLICIT_DEF; flags:
+    //      * %D7<def> = IMPLICIT_DEF; flags:
+    // After the IMPLICIT_DEFs were removed by the asm printer, the bundle
+    // became empty.
+    DEBUG(dbgs() << "Skipping empty bundle");
+    return false;
+  }
+  if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    DEBUG(dbgs() << "Skipping stand-alone insn");
+    return false;
+  }
+
+  // Reorder the bundle and copy the result back.
+  if (MCS.reshuffleTo(MCB))
+    return true;
+
+  // A failure is not expected here: real error codes are marked unreachable.
+  switch (MCS.getError()) {
+  default:
+    llvm_unreachable("unknown error");
+  case HexagonShuffler::SHUFFLE_ERROR_INVALID:
+    llvm_unreachable("invalid packet");
+  case HexagonShuffler::SHUFFLE_ERROR_STORES:
+    llvm_unreachable("too many stores");
+  case HexagonShuffler::SHUFFLE_ERROR_LOADS:
+    llvm_unreachable("too many loads");
+  case HexagonShuffler::SHUFFLE_ERROR_BRANCHES:
+    llvm_unreachable("too many branches");
+  case HexagonShuffler::SHUFFLE_ERROR_NOSLOTS:
+    llvm_unreachable("no suitable slot");
+  case HexagonShuffler::SHUFFLE_ERROR_SLOTS:
+    llvm_unreachable("over-subscribed slots");
+  case HexagonShuffler::SHUFFLE_SUCCESS: // Single instruction case.
+    break;
+  }
+  return true;
+}
+
+// Shuffle MCB, preferring an arrangement that uses one of possibleDuplexes.
+// Candidates are taken from the back of the list and the first one that
+// shuffles without error wins; if none does, MCB is shuffled as it stands.
+// Returns a HexagonShuffler::SHUFFLE_* code.
+unsigned
+llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                       MCContext &Context, MCInst &MCB,
+                       SmallVector<DuplexCandidate, 8> possibleDuplexes) {
+
+  // Nothing to do when shuffling is disabled on the command line.
+  if (DisableShuffle)
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+
+  if (!HexagonMCInstrInfo::bundleSize(MCB)) {
+    // There once was a bundle:
+    //    BUNDLE %D2<imp-def>, %R4<imp-def>, %R5<imp-def>, %D7<imp-def>, ...
+    //      * %D2<def> = IMPLICIT_DEF; flags:
+    //      * %D7<def> = IMPLICIT_DEF; flags:
+    // After the IMPLICIT_DEFs were removed by the asm printer, the bundle
+    // became empty.
+    DEBUG(dbgs() << "Skipping empty bundle");
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+  if (!HexagonMCInstrInfo::isBundle(MCB)) {
+    DEBUG(dbgs() << "Skipping stand-alone insn");
+    return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+
+  while (!possibleDuplexes.empty()) {
+    // Apply the candidate to a copy of the bundle and load it in a shuffler.
+    DuplexCandidate Candidate = possibleDuplexes.pop_back_val();
+    MCInst Attempt(MCB);
+    HexagonMCInstrInfo::replaceDuplex(Context, Attempt, Candidate);
+    HexagonMCShuffler Trial(MCII, STI, Attempt);
+
+    // Case of one duplex: copy it to the bundle without shuffling.
+    if (Trial.size() == 1) {
+      Trial.copyTo(MCB);
+      return HexagonShuffler::SHUFFLE_SUCCESS;
+    }
+
+    // Otherwise keep this candidate only if the packet shuffles with it.
+    if (Trial.reshuffleTo(MCB))
+      return HexagonShuffler::SHUFFLE_SUCCESS;
+  }
+
+  // No candidate worked (or none was given): shuffle the bundle unchanged.
+  HexagonMCShuffler Plain(MCII, STI, MCB);
+  if (!Plain.reshuffleTo(MCB))
+    // Report the error code left by the plain shuffle.
+    return Plain.getError();
+
+  return HexagonShuffler::SHUFFLE_SUCCESS;
+}
+
+// Try to add AddMI to the bundle MCB and reshuffle the result into MCB.
+// Returns false when MCB is not a bundle, AddMI is null, the bundle has no
+// room left, shuffling is disabled, or the shuffler reports an error.
+bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                            MCInst &MCB, MCInst const *AddMI, int fixupCount) {
+  if (!HexagonMCInstrInfo::isBundle(MCB) || !AddMI)
+    return false;
+
+  // if fixups present, make sure we don't insert too many nops that would
+  // later prevent an extender from being inserted.
+  const unsigned int Used = HexagonMCInstrInfo::bundleSize(MCB);
+  if (Used >= HEXAGON_PACKET_SIZE)
+    return false;
+
+  // Two or more fixups always block the insertion.
+  if (fixupCount >= 2)
+    return false;
+  // With any fixup pending, the last free slot is left alone.
+  if (fixupCount && Used == HEXAGON_PACKET_SIZE - 1)
+    return false;
+
+  if (DisableShuffle)
+    return false;
+
+  // The shuffler appends AddMI after the bundle's own insns.
+  HexagonMCShuffler MCS(MCII, STI, MCB, AddMI);
+  if (MCS.reshuffleTo(MCB))
+    return true;
+
+  // reshuffleTo() said no; only a SHUFFLE_SUCCESS error code still counts as
+  // success (single instruction case).
+  return MCS.getError() == HexagonShuffler::SHUFFLE_SUCCESS;
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
new file mode 100644
index 000000000000..a21cce1fc240
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -0,0 +1,65 @@
+//=-- HexagonMCShuffler.h ---------------------------------------------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCSHUFFLER_H
+#define HEXAGONMCSHUFFLER_H
+
+#include "MCTargetDesc/HexagonShuffler.h"
+
+namespace llvm {
+
+class MCInst;
+
+// Insn bundle shuffler.
+class HexagonMCShuffler : public HexagonShuffler {
+  bool immext_present = false; // Initialized so the accessors below never
+  bool duplex_present = false; // read an indeterminate value.
+
+public:
+  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                    MCInst &MCB)
+      : HexagonShuffler(MCII, STI) {
+    init(MCB);
+  };
+  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                    MCInst &MCB, const MCInst *AddMI,
+                    bool bInsertAtFront = false)
+      : HexagonShuffler(MCII, STI) {
+    init(MCB, AddMI, bInsertAtFront);
+  };
+
+  // Write the shuffler's current insn order into MCB.
+  void copyTo(MCInst &MCB);
+  // Shuffle, then copy the result into MCB; false if an error code is set.
+  bool reshuffleTo(MCInst &MCB);
+
+  bool immextPresent() const { return immext_present; };
+  bool duplexPresent() const { return duplex_present; };
+
+private:
+  void init(MCInst &MCB);
+  void init(MCInst &MCB, const MCInst *AddMI, bool bInsertAtFront = false);
+};
+
+// Invocation of the shuffler.
+bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                      MCInst &);
+bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                      MCInst &, const MCInst *, int);
+unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                          MCContext &Context, MCInst &,
+                          SmallVector<DuplexCandidate, 8>);
+}
+
+#endif // HEXAGONMCSHUFFLER_H
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 59395e230fa9..43734ed6ca3f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -55,7 +55,7 @@ createHexagonMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
 }
 
 static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
-                                         StringRef TT) {
+                                         const Triple &TT) {
   MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
 
   // VirtualFP = (R30 + #0).
@@ -112,11 +112,11 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
   TargetRegistry::RegisterMCCodeEmitter(TheHexagonTarget,
                                         createHexagonMCCodeEmitter);
 
-  // Register the MC Inst Printer
-  TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget,
-                                        createHexagonMCInstPrinter);
-
   // Register the asm backend
   TargetRegistry::RegisterMCAsmBackend(TheHexagonTarget,
                                        createHexagonAsmBackend);
+
+  // Register the MC Inst Printer
+  TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget,
+                                        createHexagonMCInstPrinter);
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index de63fd271aea..81211cc026db 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -17,6 +17,8 @@
 #include <cstdint>
 
 namespace llvm {
+struct InstrItinerary;
+struct InstrStage;
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
@@ -31,6 +33,8 @@ class raw_pwrite_stream;
 
 extern Target TheHexagonTarget;
 
+extern const InstrStage HexagonStages[];
+
 MCInstrInfo *createHexagonMCInstrInfo();
 
 MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
new file mode 100644
index 000000000000..feaaa4f780d5
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -0,0 +1,385 @@
+//===----- HexagonShuffler.cpp - Instruction bundle shuffling -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-shuffle"
+
+#include <algorithm>
+#include <utility>
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "HexagonShuffler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Insn shuffling priority: the bids accumulated for one slot.
+class HexagonBid {
+  // A bid for slot mask B is worth MAX / popcount(B), so the fewer slots an
+  // insn may run on, the more it bids on each of them.  MAX is divisible by
+  // every count from 1 to 15.
+  enum { MAX = 360360 }; // LCD of 1/2, 1/3, 1/4,... 1/15.
+  unsigned Bid;
+
+public:
+  HexagonBid() : Bid(0) {}
+  HexagonBid(unsigned B) : Bid(B ? MAX / countPopulation(B) : 0) {}
+
+  // True once the accumulated bids reach MAX or more.
+  bool isSold() const { return Bid >= MAX; }
+
+  HexagonBid &operator+=(const HexagonBid &Other) {
+    Bid += Other.Bid;
+    return *this;
+  }
+};
+
+// Slot shuffling allocation.
+class HexagonUnitAuction {
+  HexagonBid Scores[HEXAGON_PACKET_SIZE];
+  // Mask indicating which slot is unavailable.
+  unsigned isSold : HEXAGON_PACKET_SIZE;
+
+public:
+  HexagonUnitAuction() : isSold(0) {}
+
+  // Bid for the slots in mask B.  Slots already sold are dropped from the
+  // mask first; returns false when that leaves nothing to bid on.
+  bool bid(unsigned B) {
+    const unsigned Open = B & ~isSold;
+    if (!Open)
+      return false;
+
+    for (unsigned Slot = 0; Slot < HEXAGON_PACKET_SIZE; ++Slot) {
+      if (!(Open & (1 << Slot)))
+        continue;
+      // Add this bid to the slot's score and mark the slot once it is sold.
+      Scores[Slot] += HexagonBid(Open);
+      isSold |= Scores[Slot].isSold() << Slot;
+    }
+
+    return true;
+  }
+};
+
+unsigned HexagonResource::setWeight(unsigned s) {
+  const unsigned SlotWeight = 8;
+  const unsigned MaskWeight = SlotWeight - 1;
+  bool Key = (1 << s) & getUnits(); // Is bit s set in the unit mask?
+
+  // Weight is 0 unless Key is set.  Otherwise it is scaled by 2^(8 * s) and is
+  // larger the fewer bits the unit mask has set and the higher the mask's
+  // lowest set bit is.
+  Weight =
+      (Key << (SlotWeight * s)) * ((MaskWeight - countPopulation(getUnits()))
+                                   << countTrailingZeros(getUnits()));
+  return (Weight);
+}
+
+HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
+                                 MCSubtargetInfo const &STI)
+    : MCII(MCII), STI(STI) {
+  reset(); // Start from an empty packet with no flags and no error.
+}
+
+void HexagonShuffler::reset() {
+  Error = SHUFFLE_SUCCESS; // No failure recorded.
+  BundleFlags = 0;
+  Packet.clear();
+}
+
+void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
+                             unsigned S, bool X) {
+  // Wrap the insn, its optional extender, its slot mask S and the solo
+  // exception flag X in a handle and queue it at the end of the packet.
+  Packet.push_back(HexagonInstr(ID, Extender, S, X));
+}
+
+/// Check that the packet is legal and enforce relative insn order.
+bool HexagonShuffler::check() {
+  // Descriptive slot masks.
+  const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1, slotOne = 0x2,
+                 slotThree = 0x8, slotFirstJump = 0x8, slotLastJump = 0x4,
+                 slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
+  // Highest slots for branches and stores used to keep their original order.
+  unsigned slotJump = slotFirstJump;
+  unsigned slotLoadStore = slotFirstLoadStore;
+  // Number of branches, solo branches, indirect branches.
+  unsigned jumps = 0, jump1 = 0, jumpr = 0;
+  // Number of memory operations, loads, solo loads, stores, solo stores, single
+  // stores.
+  unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
+  // Number of duplex insns (never incremented in this function), solo insns.
+  unsigned duplex = 0, solo = 0;
+  // Number of insns restricting other insns in the packet to A and X types,
+  // and number of insns that are neither A nor X type.
+  unsigned onlyAX = 0, neitherAnorX = 0;
+  // Number of insns restricting other insns in slot #1 to A type.
+  unsigned onlyAin1 = 0;
+  // Number of insns barring all but A2_nop from slot #1 (never incremented).
+  unsigned onlyNo1 = 0;
+  unsigned xtypeFloat = 0;
+  unsigned pSlot3Cnt = 0;
+  iterator slot3ISJ = end();
+
+  // Collect information from the insns in the packet.
+  for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+    MCInst const *ID = ISJ->getDesc();
+
+    if (HexagonMCInstrInfo::isSolo(MCII, *ID))
+      solo += !ISJ->isSoloException();
+    else if (HexagonMCInstrInfo::isSoloAX(MCII, *ID))
+      onlyAX += !ISJ->isSoloException();
+    else if (HexagonMCInstrInfo::isSoloAin1(MCII, *ID))
+      onlyAin1 += !ISJ->isSoloException();
+    if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32 &&
+        HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeXTYPE)
+      ++neitherAnorX;
+    if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) {
+      ++pSlot3Cnt;
+      slot3ISJ = ISJ;
+    }
+
+    switch (HexagonMCInstrInfo::getType(MCII, *ID)) {
+    case HexagonII::TypeXTYPE:
+      if (HexagonMCInstrInfo::isFloat(MCII, *ID))
+        ++xtypeFloat;
+      break;
+    case HexagonII::TypeJR:
+      ++jumpr;
+    // Fall-through.
+    case HexagonII::TypeJ:
+      ++jumps;
+      break;
+    case HexagonII::TypeLD:
+      ++loads;
+      ++memory;
+      if (ISJ->Core.getUnits() == slotSingleLoad)
+        ++load0;
+      if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
+        ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+      break;
+    case HexagonII::TypeST:
+      ++stores;
+      ++memory;
+      if (ISJ->Core.getUnits() == slotSingleStore)
+        ++store0;
+      break;
+    case HexagonII::TypeMEMOP:
+      ++loads;
+      ++stores;
+      ++store1;
+      ++memory;
+      break;
+    case HexagonII::TypeNV:
+      ++memory; // NV insns are memory-like.
+      if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch())
+        ++jumps, ++jump1;
+      break;
+    case HexagonII::TypeCR:
+    // Legacy conditional branch predicated on a register.
+    case HexagonII::TypeSYSTEM:
+      if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad())
+        ++loads;
+      break;
+    }
+  }
+
+  // Check if the packet is legal.
+  if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) ||
+      (solo && size() > 1) || (onlyAX && neitherAnorX > 1) ||
+      (onlyAX && xtypeFloat)) {
+    Error = SHUFFLE_ERROR_INVALID;
+    return false;
+  }
+
+  if (jump1 && jumps > 1) {
+    // Error if single branch with another branch.
+    Error = SHUFFLE_ERROR_BRANCHES;
+    return false;
+  }
+
+  // Modify packet accordingly.
+  // TODO: need to reserve slots #0 and #1 for duplex insns.
+  bool bOnlySlot3 = false;
+  for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+    MCInst const *ID = ISJ->getDesc();
+
+    if (!ISJ->Core.getUnits()) {
+      // Error if insn may not be executed in any slot.
+      Error = SHUFFLE_ERROR_UNKNOWN;
+      return false;
+    }
+
+    // Exclude from slot #1 any insn but A2_nop.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).getOpcode() != Hexagon::A2_nop)
+      if (onlyNo1)
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
+
+    // Exclude from slot #1 any insn but A-type.
+    if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32)
+      if (onlyAin1)
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
+
+    // Branches must keep the original order.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch() ||
+        HexagonMCInstrInfo::getDesc(MCII, *ID).isCall())
+      if (jumps > 1) {
+        if (jumpr || slotJump < slotLastJump) {
+          // Error if indirect branch with another branch or
+          // no more slots available for branches.
+          Error = SHUFFLE_ERROR_BRANCHES;
+          return false;
+        }
+        // Pin the branch to the highest slot available to it.
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & slotJump);
+        // Update next highest slot available to branches.
+        slotJump >>= 1;
+      }
+
+    // A load that is the packet's only memory operation must use slot #0.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad()) {
+      if (loads == 1 && loads == memory)
+        // Pin the load to slot #0.
+        ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
+    }
+
+    // A single store must use slot #0.
+    if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayStore()) {
+      if (!store0) {
+        if (stores == 1)
+          ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
+        else if (stores > 1) {
+          if (slotLoadStore < slotLastLoadStore) {
+            // Error if no more slots available for stores.
+            Error = SHUFFLE_ERROR_STORES;
+            return false;
+          }
+          // Pin the store to the highest slot available to it.
+          ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
+          // Update the next highest slot available to stores.
+          slotLoadStore >>= 1;
+        }
+      }
+      if (store1 && stores > 1) {
+        // Error if a single store with another store.
+        Error = SHUFFLE_ERROR_STORES;
+        return false;
+      }
+    }
+
+    // Flag if an instruction can only be executed in slot 3.
+    if (ISJ->Core.getUnits() == slotThree)
+      bOnlySlot3 = true;
+
+    if (!ISJ->Core.getUnits()) {
+      // Error if the pinning above left the insn without any slot.
+      Error = SHUFFLE_ERROR_NOSLOTS;
+      return false;
+    }
+  }
+
+  bool validateSlots = true;
+  if (bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+    // Save off the slot mask of the one insn that prefers slot #3
+    // and then pin it to slot #3.
+    unsigned saveUnits = slot3ISJ->Core.getUnits();
+    slot3ISJ->Core.setUnits(saveUnits & slotThree);
+
+    HexagonUnitAuction AuctionCore;
+    std::sort(begin(), end(), HexagonInstr::lessCore);
+
+    // See if the auction still succeeds with that insn pinned to slot #3.
+    bool bFail = false;
+    for (iterator I = begin(); I != end() && bFail != true; ++I)
+      if (!AuctionCore.bid(I->Core.getUnits()))
+        bFail = true;
+
+    // If yes, great; if not, restore the original slot mask.
+    if (!bFail)
+      validateSlots = false; // all good, no need to re-do auction
+    else
+      for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+        MCInst const *ID = ISJ->getDesc();
+        if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID))
+          ISJ->Core.setUnits(saveUnits);
+      }
+  }
+
+  // Check if any slot, core, is over-subscribed.
+  // Verify the core slot subscriptions.
+  if (validateSlots) {
+    HexagonUnitAuction AuctionCore;
+
+    std::sort(begin(), end(), HexagonInstr::lessCore);
+
+    for (iterator I = begin(); I != end(); ++I)
+      if (!AuctionCore.bid(I->Core.getUnits())) {
+        Error = SHUFFLE_ERROR_SLOTS;
+        return false;
+      }
+  }
+
+  Error = SHUFFLE_SUCCESS;
+  return true;
+}
+
+bool HexagonShuffler::shuffle() {
+  if (size() > HEXAGON_PACKET_SIZE) {
+    // Ignore a packet with more than what a packet can hold
+    // or with compound or duplex insns for now.
+    Error = SHUFFLE_ERROR_INVALID;
+    return false;
+  }
+
+  // Check and prepare packet; packets of at most one insn are left as is.
+  if (size() > 1 && check())
+    // Reorder the handles for each slot.
+    for (unsigned nSlot = 0, emptySlots = 0; nSlot < HEXAGON_PACKET_SIZE;
+         ++nSlot) {
+      iterator ISJ, ISK;
+      unsigned slotSkip, slotWeight;
+
+      // Prioritize the handles considering their restrictions.
+      for (ISJ = ISK = Packet.begin(), slotSkip = slotWeight = 0;
+           ISK != Packet.end(); ++ISK, ++slotSkip)
+        if (slotSkip < nSlot - emptySlots)
+          // Note which handle to begin at.
+          ++ISJ;
+        else
+          // Calculate the weight of the slot.
+          slotWeight += ISK->Core.setWeight(HEXAGON_PACKET_SIZE - nSlot - 1);
+
+      if (slotWeight)
+        // Sort the packet, favoring source order,
+        // beginning after the previous slot.
+        std::sort(ISJ, Packet.end());
+      else
+        // Skip unused slot.
+        ++emptySlots;
+    }
+
+  for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
+    DEBUG(dbgs().write_hex(ISJ->Core.getUnits());
+          dbgs() << ':'
+                 << HexagonMCInstrInfo::getDesc(MCII, *ISJ->getDesc())
+                        .getOpcode();
+          dbgs() << '\n');
+  DEBUG(dbgs() << '\n');
+
+  return (!getError());
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
new file mode 100644
index 000000000000..9218fd3eb070
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -0,0 +1,139 @@
+//===----- HexagonShuffler.h - Instruction bundle shuffling ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the shuffling of insns inside a bundle according to the
+// packet formation rules of the Hexagon ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONSHUFFLER_H
+#define HEXAGONSHUFFLER_H
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+// Insn resources.
+class HexagonResource {
+  // Mask of the slots or units that may execute the insn and
+  // the weight or priority that the insn requires to be assigned a slot.
+  unsigned Slots, Weight;
+
+public:
+  HexagonResource(unsigned s) { setUnits(s); };
+  // Keep the low HEXAGON_PACKET_SIZE bits; ~0u avoids shifting a negative.
+  void setUnits(unsigned s) {
+    Slots = s & ~(~0u << HEXAGON_PACKET_SIZE);
+    setWeight(s); // NOTE(review): s is a mask here, a slot index in setWeight.
+  };
+  unsigned setWeight(unsigned s);
+
+  unsigned getUnits() const { return (Slots); };
+  unsigned getWeight() const { return (Weight); };
+
+  // Order resources by the number of slots they may use, fewest first.
+  static bool lessUnits(const HexagonResource &A, const HexagonResource &B) {
+    return (countPopulation(A.getUnits()) < countPopulation(B.getUnits()));
+  };
+  // Check if the resources are in ascending weight order.
+  static bool lessWeight(const HexagonResource &A, const HexagonResource &B) {
+    return (A.getWeight() < B.getWeight());
+  };
+};
+
+// Handle to an insn used by the shuffling algorithm.
+class HexagonInstr {
+  friend class HexagonShuffler;
+
+  MCInst const *ID;
+  MCInst const *Extender;
+  HexagonResource Core;
+  bool SoloException;
+
+public:
+  HexagonInstr(MCInst const *Inst, MCInst const *Ext, unsigned Units,
+               bool Solo = false)
+      : ID(Inst), Extender(Ext), Core(Units), SoloException(Solo) {}
+
+  MCInst const *getDesc() const { return ID; }
+
+  MCInst const *getExtender() const { return Extender; }
+
+  unsigned isSoloException() const { return SoloException; }
+
+  // Shuffle order: *this sorts before Other when Other's core weight is less.
+  bool operator<(const HexagonInstr &Other) const {
+    return HexagonResource::lessWeight(Other.Core, Core);
+  }
+  // Order handles by the number of core slots they may use, fewest first.
+  static bool lessCore(const HexagonInstr &L, const HexagonInstr &R) {
+    return HexagonResource::lessUnits(L.Core, R.Core);
+  }
+};
+
+// Bundle shuffler.
+class HexagonShuffler {
+  typedef SmallVector<HexagonInstr, HEXAGON_PRESHUFFLE_PACKET_SIZE>
+      HexagonPacket;
+
+  // Insn handles in a bundle.
+  HexagonPacket Packet;
+
+  // Shuffling error code.
+  unsigned Error;
+
+protected:
+  int64_t BundleFlags;
+  MCInstrInfo const &MCII;
+  MCSubtargetInfo const &STI;
+
+public:
+  typedef HexagonPacket::iterator iterator;
+
+  enum {
+    SHUFFLE_SUCCESS = 0,    ///< Successful operation.
+    SHUFFLE_ERROR_INVALID,  ///< Invalid bundle.
+    SHUFFLE_ERROR_STORES,   ///< No free slots for store insns.
+    SHUFFLE_ERROR_LOADS,    ///< No free slots for load insns.
+    SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns.
+    SHUFFLE_ERROR_NOSLOTS,  ///< No free slots for other insns.
+    SHUFFLE_ERROR_SLOTS,    ///< Over-subscribed slots.
+    SHUFFLE_ERROR_UNKNOWN   ///< Unknown error.
+  };
+
+  explicit HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI);
+
+  // Reset to initial state.
+  void reset();
+  // Check if the bundle may be validly shuffled.
+  bool check();
+  // Reorder the insn handles in the bundle.
+  bool shuffle();
+
+  unsigned size() const { return (Packet.size()); };
+
+  iterator begin() { return (Packet.begin()); };
+  iterator end() { return (Packet.end()); };
+
+  // Add insn handle to the bundle.
+  void append(MCInst const *ID, MCInst const *Extender, unsigned S,
+              bool X = false);
+
+  // Set / return the error code of the last check or shuffle of the bundle.
+  void setError(unsigned Err) { Error = Err; };
+  unsigned getError() const { return (Error); };
+};
+}
+
+#endif // HEXAGONSHUFFLER_H
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
index 6c43d97837ca..be6d1a84a377 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -39,7 +39,7 @@ void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
     O << Op.getImm();
   else {
     assert(Op.isExpr() && "unknown pcrel immediate operand");
-    O << *Op.getExpr();
+    Op.getExpr()->print(O, &MAI);
   }
 }
 
@@ -53,7 +53,8 @@ void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << '#' << Op.getImm();
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << '#' << *Op.getExpr();
+    O << '#';
+    Op.getExpr()->print(O, &MAI);
   }
 }
 
@@ -75,7 +76,7 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
     O << '&';
 
   if (Disp.isExpr())
-    O << *Disp.getExpr();
+    Disp.getExpr()->print(O, &MAI);
   else {
     assert(Disp.isImm() && "Expected immediate in displacement field");
     O << Disp.getImm();
diff --git a/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt b/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt
index b8f3d02ab4d8..a305b2db8683 100644
--- a/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/MSP430/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = MSP430Desc
 parent = MSP430
-required_libraries = MC MSP430AsmPrinter MSP430Info
+required_libraries = MC MSP430AsmPrinter MSP430Info Support
 add_to_library_groups = MSP430
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index df1aa1a41f19..c26b3081dbc3 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -12,12 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "MSP430MCAsmInfo.h"
-#include "llvm/ADT/StringRef.h"
 using namespace llvm;
 
 void MSP430MCAsmInfo::anchor() { }
 
-MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
+MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) {
   PointerSize = CalleeSaveStackSlotSize = 2;
 
   CommentString = ";";
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index 2c9532d321e4..ff5b0b6d858c 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -17,12 +17,12 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-  class StringRef;
+  class Triple;
 
   class MSP430MCAsmInfo : public MCAsmInfoELF {
     void anchor() override;
   public:
-    explicit MSP430MCAsmInfo(StringRef TT);
+    explicit MSP430MCAsmInfo(const Triple &TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index a99c9a3e2374..4342c10a1bf2 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -75,7 +75,7 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     O << MO.getImm();
     return;
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     return;
   case MachineOperand::MO_GlobalAddress: {
     bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
@@ -92,7 +92,7 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     if (Offset)
       O << '(' << Offset << '+';
 
-    O << *getSymbol(MO.getGlobal());
+    getSymbol(MO.getGlobal())->print(O, MAI);
 
     if (Offset)
       O << ')';
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index b039778d96c4..54154a8afac1 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -96,7 +96,7 @@ MCOperand MSP430MCInstLower::
 LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
   // FIXME: We would like an efficient form for this, so we don't have to do a
   // lot of extra uniquing.
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
 
   switch (MO.getTargetFlags()) {
   default: llvm_unreachable("Unknown target flag on GV operand");
@@ -104,8 +104,8 @@ LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
   }
 
   if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(), Ctx),
                                    Ctx);
   return MCOperand::createExpr(Expr);
 }
@@ -130,7 +130,7 @@ void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::createImm(MO.getImm());
       break;
     case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(
+      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
                          MO.getMBB()->getSymbol(), Ctx));
       break;
     case MachineOperand::MO_GlobalAddress:
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index aade12b3046a..9c054e5ac231 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -43,7 +43,7 @@ class MCInstrInfo;
 namespace {
 class MipsAssemblerOptions {
 public:
-  MipsAssemblerOptions(uint64_t Features_) : 
+  MipsAssemblerOptions(const FeatureBitset &Features_) :
     ATReg(1), Reorder(true), Macro(true), Features(Features_) {}
 
   MipsAssemblerOptions(const MipsAssemblerOptions *Opts) {
@@ -70,8 +70,8 @@ public:
   void setMacro() { Macro = true; }
   void setNoMacro() { Macro = false; }
 
-  uint64_t getFeatures() const { return Features; }
-  void setFeatures(uint64_t Features_) { Features = Features_; }
+  const FeatureBitset &getFeatures() const { return Features; }
+  void setFeatures(const FeatureBitset &Features_) { Features = Features_; }
 
   // Set of features that are either architecture features or referenced
   // by them (e.g.: FeatureNaN2008 implied by FeatureMips32r6).
@@ -84,7 +84,7 @@ private:
   unsigned ATReg;
   bool Reorder;
   bool Macro;
-  uint64_t Features;
+  FeatureBitset Features;
 };
 }
 
@@ -247,6 +247,8 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool parseSetFpDirective();
   bool parseSetPopDirective();
   bool parseSetPushDirective();
+  bool parseSetSoftFloatDirective();
+  bool parseSetHardFloatDirective();
 
   bool parseSetAssignment();
 
@@ -325,23 +327,23 @@ class MipsAsmParser : public MCTargetAsmParser {
     STI.setFeatureBits(FeatureBits);
     setAvailableFeatures(
         ComputeAvailableFeatures(STI.ToggleFeature(ArchFeature)));
-    AssemblerOptions.back()->setFeatures(getAvailableFeatures());
+    AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
   }
 
   void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
     if (!(STI.getFeatureBits()[Feature])) {
       setAvailableFeatures(
           ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+      AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
     }
-    AssemblerOptions.back()->setFeatures(getAvailableFeatures());
   }
 
   void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
     if (STI.getFeatureBits()[Feature]) {
       setAvailableFeatures(
           ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
+      AssemblerOptions.back()->setFeatures(STI.getFeatureBits());
     }
-    AssemblerOptions.back()->setFeatures(getAvailableFeatures());
   }
 
 public:
@@ -367,11 +369,11 @@ public:
     
     // Remember the initial assembler options. The user can not modify these.
     AssemblerOptions.push_back(
-                     make_unique<MipsAssemblerOptions>(getAvailableFeatures()));
+        llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits()));
     
     // Create an assembler options environment for the user to modify.
     AssemblerOptions.push_back(
-                     make_unique<MipsAssemblerOptions>(getAvailableFeatures()));
+        llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits()));
 
     getTargetStreamer().updateABIInfo(*this);
 
@@ -1946,10 +1948,10 @@ void MipsAsmParser::expandLoadAddressSym(
   unsigned RegNo = DstRegOp.getReg();
   const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymOp.getExpr());
   const MCSymbolRefExpr *HiExpr =
-      MCSymbolRefExpr::Create(Symbol->getSymbol().getName(),
+      MCSymbolRefExpr::create(Symbol->getSymbol().getName(),
                               MCSymbolRefExpr::VK_Mips_ABS_HI, getContext());
   const MCSymbolRefExpr *LoExpr =
-      MCSymbolRefExpr::Create(Symbol->getSymbol().getName(),
+      MCSymbolRefExpr::create(Symbol->getSymbol().getName(),
                               MCSymbolRefExpr::VK_Mips_ABS_LO, getContext());
   if (!Is32BitSym) {
     // If it's a 64-bit architecture, expand to:
@@ -1960,10 +1962,10 @@ void MipsAsmParser::expandLoadAddressSym(
     //             dsll d,d,16
     //             ori  d,d,lo16(sym)
     const MCSymbolRefExpr *HighestExpr =
-        MCSymbolRefExpr::Create(Symbol->getSymbol().getName(),
+        MCSymbolRefExpr::create(Symbol->getSymbol().getName(),
                                 MCSymbolRefExpr::VK_Mips_HIGHEST, getContext());
     const MCSymbolRefExpr *HigherExpr =
-        MCSymbolRefExpr::Create(Symbol->getSymbol().getName(),
+        MCSymbolRefExpr::create(Symbol->getSymbol().getName(),
                                 MCSymbolRefExpr::VK_Mips_HIGHER, getContext());
 
     tmpInst.setOpcode(Mips::LUi);
@@ -2102,7 +2104,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
   else {
     if (ExprOffset->getKind() == MCExpr::SymbolRef) {
       SR = static_cast<const MCSymbolRefExpr *>(ExprOffset);
-      const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+      const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create(
           SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_HI,
           getContext());
       TempInst.addOperand(MCOperand::createExpr(HiExpr));
@@ -2133,7 +2135,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
     TempInst.addOperand(MCOperand::createImm(LoOffset));
   else {
     if (ExprOffset->getKind() == MCExpr::SymbolRef) {
-      const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+      const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create(
           SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_LO,
           getContext());
       TempInst.addOperand(MCOperand::createExpr(LoExpr));
@@ -2505,7 +2507,7 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
     MCSymbol *Sym = getContext().getOrCreateSymbol("$" + Identifier);
     // Otherwise create a symbol reference.
     const MCExpr *Res =
-        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
 
     Operands.push_back(MipsOperand::CreateImm(Res, S, E, *this));
     return false;
@@ -2565,14 +2567,14 @@ const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
     default:
       report_fatal_error("unsupported reloc value");
     }
-    return MCConstantExpr::Create(Val, getContext());
+    return MCConstantExpr::create(Val, getContext());
   }
 
   if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(Expr)) {
     // It's a symbol, create a symbolic expression from the symbol.
     StringRef Symbol = MSRE->getSymbol().getName();
     MCSymbolRefExpr::VariantKind VK = getVariantKind(RelocStr);
-    Res = MCSymbolRefExpr::Create(Symbol, VK, getContext());
+    Res = MCSymbolRefExpr::create(Symbol, VK, getContext());
     return Res;
   }
 
@@ -2581,17 +2583,17 @@ const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
 
     // Try to create target expression.
     if (MipsMCExpr::isSupportedBinaryExpr(VK, BE))
-      return MipsMCExpr::Create(VK, Expr, getContext());
+      return MipsMCExpr::create(VK, Expr, getContext());
 
     const MCExpr *LExp = evaluateRelocExpr(BE->getLHS(), RelocStr);
     const MCExpr *RExp = evaluateRelocExpr(BE->getRHS(), RelocStr);
-    Res = MCBinaryExpr::Create(BE->getOpcode(), LExp, RExp, getContext());
+    Res = MCBinaryExpr::create(BE->getOpcode(), LExp, RExp, getContext());
     return Res;
   }
 
   if (const MCUnaryExpr *UN = dyn_cast<MCUnaryExpr>(Expr)) {
     const MCExpr *UnExp = evaluateRelocExpr(UN->getSubExpr(), RelocStr);
-    Res = MCUnaryExpr::Create(UN->getOpcode(), UnExp, getContext());
+    Res = MCUnaryExpr::create(UN->getOpcode(), UnExp, getContext());
     return Res;
   }
   // Just return the original expression.
@@ -2779,7 +2781,7 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) {
   Parser.Lex(); // Eat the ')' token.
 
   if (!IdVal)
-    IdVal = MCConstantExpr::Create(0, getContext());
+    IdVal = MCConstantExpr::create(0, getContext());
 
   // Replace the register operand with the memory operand.
   std::unique_ptr<MipsOperand> op(
@@ -2790,10 +2792,10 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) {
   // Add the memory operand.
   if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(IdVal)) {
     int64_t Imm;
-    if (IdVal->EvaluateAsAbsolute(Imm))
-      IdVal = MCConstantExpr::Create(Imm, getContext());
+    if (IdVal->evaluateAsAbsolute(Imm))
+      IdVal = MCConstantExpr::create(Imm, getContext());
     else if (BE->getLHS()->getKind() != MCExpr::SymbolRef)
-      IdVal = MCBinaryExpr::Create(BE->getOpcode(), BE->getRHS(), BE->getLHS(),
+      IdVal = MCBinaryExpr::create(BE->getOpcode(), BE->getRHS(), BE->getLHS(),
                                    getContext());
   }
 
@@ -3010,7 +3012,7 @@ MipsAsmParser::parseInvNum(OperandVector &Operands) {
   int64_t Val = MCE->getValue();
   SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   Operands.push_back(MipsOperand::CreateImm(
-      MCConstantExpr::Create(0 - Val, getContext()), S, E, *this));
+      MCConstantExpr::create(0 - Val, getContext()), S, E, *this));
   return MatchOperand_Success;
 }
 
@@ -3034,7 +3036,7 @@ MipsAsmParser::parseLSAImm(OperandVector &Operands) {
     return MatchOperand_ParseFail;
 
   int64_t Val;
-  if (!Expr->EvaluateAsAbsolute(Val)) {
+  if (!Expr->evaluateAsAbsolute(Val)) {
     Error(S, "expected immediate value");
     return MatchOperand_ParseFail;
   }
@@ -3601,7 +3603,9 @@ bool MipsAsmParser::parseSetPopDirective() {
     return reportParseError(Loc, ".set pop with no .set push");
 
   AssemblerOptions.pop_back();
-  setAvailableFeatures(AssemblerOptions.back()->getFeatures());
+  setAvailableFeatures(
+      ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures()));
+  STI.setFeatureBits(AssemblerOptions.back()->getFeatures());
 
   getTargetStreamer().emitDirectiveSetPop();
   return false;
@@ -3621,6 +3625,28 @@ bool MipsAsmParser::parseSetPushDirective() {
   return false;
 }
 
+bool MipsAsmParser::parseSetSoftFloatDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+  // Statement ended cleanly: enable "soft-float" and emit the directive.
+  setFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+  getTargetStreamer().emitDirectiveSetSoftFloat();
+  return false;
+}
+
+bool MipsAsmParser::parseSetHardFloatDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+  // Statement ended cleanly: disable "soft-float" and emit the directive.
+  clearFeatureBits(Mips::FeatureSoftFloat, "soft-float");
+  getTargetStreamer().emitDirectiveSetHardFloat();
+  return false;
+}
+
 bool MipsAsmParser::parseSetAssignment() {
   StringRef Name;
   const MCExpr *Value;
@@ -3649,7 +3675,9 @@ bool MipsAsmParser::parseSetMips0Directive() {
     return reportParseError("unexpected token, expected end of statement");
 
   // Reset assembler options to their initial values.
-  setAvailableFeatures(AssemblerOptions.front()->getFeatures());
+  setAvailableFeatures(
+      ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures()));
+  STI.setFeatureBits(AssemblerOptions.front()->getFeatures());
   AssemblerOptions.back()->setFeatures(AssemblerOptions.front()->getFeatures());
 
   getTargetStreamer().emitDirectiveSetMips0();
@@ -3985,6 +4013,10 @@ bool MipsAsmParser::parseDirectiveSet() {
     return parseSetMsaDirective();
   } else if (Tok.getString() == "nomsa") {
     return parseSetNoMsaDirective();
+  } else if (Tok.getString() == "softfloat") {
+    return parseSetSoftFloatDirective();
+  } else if (Tok.getString() == "hardfloat") {
+    return parseSetHardFloatDirective();
   } else {
     // It is just an identifier, look for an assignment.
     parseSetAssignment();
@@ -4286,7 +4318,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
         reportParseError("expected number after comma");
         return false;
       }
-      if (!DummyNumber->EvaluateAsAbsolute(DummyNumberVal)) {
+      if (!DummyNumber->evaluateAsAbsolute(DummyNumberVal)) {
         reportParseError("expected an absolute expression after comma");
         return false;
       }
@@ -4366,7 +4398,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
       return false;
     }
 
-    if (!FrameSize->EvaluateAsAbsolute(FrameSizeVal)) {
+    if (!FrameSize->evaluateAsAbsolute(FrameSizeVal)) {
       reportParseError("frame size not an absolute expression");
       return false;
     }
@@ -4427,7 +4459,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
       return false;
     }
 
-    if (!BitMask->EvaluateAsAbsolute(BitMaskVal)) {
+    if (!BitMask->evaluateAsAbsolute(BitMaskVal)) {
       reportParseError("bitmask not an absolute expression");
       return false;
     }
@@ -4448,7 +4480,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
       return false;
     }
 
-    if (!FrameOffset->EvaluateAsAbsolute(FrameOffsetVal)) {
+    if (!FrameOffset->evaluateAsAbsolute(FrameOffsetVal)) {
       reportParseError("frame offset not an absolute expression");
       return false;
     }
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index eb97c93ac196..c8629b5d7bd2 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -47,6 +47,8 @@ public:
 
   bool isGP64() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
 
+  bool hasCnMips() const { return STI.getFeatureBits()[Mips::FeatureCnMips]; }
+
   bool hasCOP3() const {
     // Only present in MIPS-I and MIPS-II
     return !hasMips32() && !hasMips3();
@@ -889,6 +891,16 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
     }
   }
 
+  if (hasCnMips()) {
+    DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
+    Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
   if (isGP64()) {
     DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index e80a47b90142..a5637b16b636 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -122,7 +122,8 @@ void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
   }
 }
 
-static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
+static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI,
+                      raw_ostream &OS) {
   int Offset = 0;
   const MCSymbolRefExpr *SRE;
 
@@ -132,7 +133,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
     assert(SRE && CE && "Binary expression must be sym+const.");
     Offset = CE->getValue();
   } else if (const MipsMCExpr *ME = dyn_cast<MipsMCExpr>(Expr)) {
-    ME->print(OS);
+    ME->print(OS, MAI);
     return;
   } else
     SRE = cast<MCSymbolRefExpr>(Expr);
@@ -170,7 +171,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
   case MCSymbolRefExpr::VK_Mips_PCREL_LO16: OS << "%pcrel_lo("; break;
   }
 
-  OS << SRE->getSymbol();
+  SRE->getSymbol().print(OS, MAI);
 
   if (Offset) {
     if (Offset > 0)
@@ -199,7 +200,7 @@ void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  printExpr(Op.getExpr(), O);
+  printExpr(Op.getExpr(), &MAI, O);
 }
 
 void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
index b1f7c2f22594..bf8f7d12880d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp
@@ -99,6 +99,10 @@ unsigned MipsABIInfo::GetFramePtr() const {
   return ArePtrs64bit() ? Mips::FP_64 : Mips::FP;
 }
 
+unsigned MipsABIInfo::GetBasePtr() const {
+  return ArePtrs64bit() ? Mips::S7_64 : Mips::S7;
+}
+
 unsigned MipsABIInfo::GetNullPtr() const {
   return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO;
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
index 9a6ba9467659..d20dc9037951 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -65,6 +65,7 @@ public:
 
   unsigned GetStackPtr() const;
   unsigned GetFramePtr() const;
+  unsigned GetBasePtr() const;
   unsigned GetNullPtr() const;
   unsigned GetPtrAdduOp() const;
   unsigned GetPtrAddiuOp() const;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 8d9e3e31105e..982a7f54e825 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -12,10 +12,10 @@
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <list>
@@ -46,7 +46,7 @@ struct MipsRelocationEntry {
 
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
-    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+    bool needsRelocateWithSymbol(const MCSymbol &Sym,
                                  unsigned Type) const override;
     virtual void sortRelocs(const MCAssembler &Asm,
                             std::vector<ELFRelocationEntry> &Relocs) override;
@@ -65,181 +65,134 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
                                            const MCFixup &Fixup,
                                            bool IsPCRel) const {
   // determine the type of the relocation
-  unsigned Type = (unsigned)ELF::R_MIPS_NONE;
   unsigned Kind = (unsigned)Fixup.getKind();
 
   switch (Kind) {
-  default:
-    llvm_unreachable("invalid fixup kind!");
   case Mips::fixup_Mips_32:
   case FK_Data_4:
-    Type = ELF::R_MIPS_32;
-    break;
+    return IsPCRel ? ELF::R_MIPS_PC32 : ELF::R_MIPS_32;
   case Mips::fixup_Mips_64:
   case FK_Data_8:
-    Type = ELF::R_MIPS_64;
-    break;
+    return ELF::R_MIPS_64;
   case FK_GPRel_4:
     if (isN64()) {
+      unsigned Type = (unsigned)ELF::R_MIPS_NONE;
       Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type);
       Type = setRType2((unsigned)ELF::R_MIPS_64, Type);
       Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type);
+      return Type;
     }
-    else
-      Type = ELF::R_MIPS_GPREL32;
-    break;
+    return ELF::R_MIPS_GPREL32;
   case Mips::fixup_Mips_GPREL16:
-    Type = ELF::R_MIPS_GPREL16;
-    break;
+    return ELF::R_MIPS_GPREL16;
   case Mips::fixup_Mips_26:
-    Type = ELF::R_MIPS_26;
-    break;
+    return ELF::R_MIPS_26;
   case Mips::fixup_Mips_CALL16:
-    Type = ELF::R_MIPS_CALL16;
-    break;
+    return ELF::R_MIPS_CALL16;
   case Mips::fixup_Mips_GOT_Global:
   case Mips::fixup_Mips_GOT_Local:
-    Type = ELF::R_MIPS_GOT16;
-    break;
+    return ELF::R_MIPS_GOT16;
   case Mips::fixup_Mips_HI16:
-    Type = ELF::R_MIPS_HI16;
-    break;
+    return ELF::R_MIPS_HI16;
   case Mips::fixup_Mips_LO16:
-    Type = ELF::R_MIPS_LO16;
-    break;
+    return ELF::R_MIPS_LO16;
   case Mips::fixup_Mips_TLSGD:
-    Type = ELF::R_MIPS_TLS_GD;
-    break;
+    return ELF::R_MIPS_TLS_GD;
   case Mips::fixup_Mips_GOTTPREL:
-    Type = ELF::R_MIPS_TLS_GOTTPREL;
-    break;
+    return ELF::R_MIPS_TLS_GOTTPREL;
   case Mips::fixup_Mips_TPREL_HI:
-    Type = ELF::R_MIPS_TLS_TPREL_HI16;
-    break;
+    return ELF::R_MIPS_TLS_TPREL_HI16;
   case Mips::fixup_Mips_TPREL_LO:
-    Type = ELF::R_MIPS_TLS_TPREL_LO16;
-    break;
+    return ELF::R_MIPS_TLS_TPREL_LO16;
   case Mips::fixup_Mips_TLSLDM:
-    Type = ELF::R_MIPS_TLS_LDM;
-    break;
+    return ELF::R_MIPS_TLS_LDM;
   case Mips::fixup_Mips_DTPREL_HI:
-    Type = ELF::R_MIPS_TLS_DTPREL_HI16;
-    break;
+    return ELF::R_MIPS_TLS_DTPREL_HI16;
   case Mips::fixup_Mips_DTPREL_LO:
-    Type = ELF::R_MIPS_TLS_DTPREL_LO16;
-    break;
+    return ELF::R_MIPS_TLS_DTPREL_LO16;
   case Mips::fixup_Mips_Branch_PCRel:
   case Mips::fixup_Mips_PC16:
-    Type = ELF::R_MIPS_PC16;
-    break;
+    return ELF::R_MIPS_PC16;
   case Mips::fixup_Mips_GOT_PAGE:
-    Type = ELF::R_MIPS_GOT_PAGE;
-    break;
+    return ELF::R_MIPS_GOT_PAGE;
   case Mips::fixup_Mips_GOT_OFST:
-    Type = ELF::R_MIPS_GOT_OFST;
-    break;
+    return ELF::R_MIPS_GOT_OFST;
   case Mips::fixup_Mips_GOT_DISP:
-    Type = ELF::R_MIPS_GOT_DISP;
-    break;
-  case Mips::fixup_Mips_GPOFF_HI:
+    return ELF::R_MIPS_GOT_DISP;
+  case Mips::fixup_Mips_GPOFF_HI: {
+    unsigned Type = (unsigned)ELF::R_MIPS_NONE;
     Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
     Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
     Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
-    break;
-  case Mips::fixup_Mips_GPOFF_LO:
+    return Type;
+  }
+  case Mips::fixup_Mips_GPOFF_LO: {
+    unsigned Type = (unsigned)ELF::R_MIPS_NONE;
     Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
     Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
     Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
-    break;
+    return Type;
+  }
   case Mips::fixup_Mips_HIGHER:
-    Type = ELF::R_MIPS_HIGHER;
-    break;
+    return ELF::R_MIPS_HIGHER;
   case Mips::fixup_Mips_HIGHEST:
-    Type = ELF::R_MIPS_HIGHEST;
-    break;
+    return ELF::R_MIPS_HIGHEST;
   case Mips::fixup_Mips_GOT_HI16:
-    Type = ELF::R_MIPS_GOT_HI16;
-    break;
+    return ELF::R_MIPS_GOT_HI16;
   case Mips::fixup_Mips_GOT_LO16:
-    Type = ELF::R_MIPS_GOT_LO16;
-    break;
+    return ELF::R_MIPS_GOT_LO16;
   case Mips::fixup_Mips_CALL_HI16:
-    Type = ELF::R_MIPS_CALL_HI16;
-    break;
+    return ELF::R_MIPS_CALL_HI16;
   case Mips::fixup_Mips_CALL_LO16:
-    Type = ELF::R_MIPS_CALL_LO16;
-    break;
+    return ELF::R_MIPS_CALL_LO16;
   case Mips::fixup_MICROMIPS_26_S1:
-    Type = ELF::R_MICROMIPS_26_S1;
-    break;
+    return ELF::R_MICROMIPS_26_S1;
   case Mips::fixup_MICROMIPS_HI16:
-    Type = ELF::R_MICROMIPS_HI16;
-    break;
+    return ELF::R_MICROMIPS_HI16;
   case Mips::fixup_MICROMIPS_LO16:
-    Type = ELF::R_MICROMIPS_LO16;
-    break;
+    return ELF::R_MICROMIPS_LO16;
   case Mips::fixup_MICROMIPS_GOT16:
-    Type = ELF::R_MICROMIPS_GOT16;
-    break;
+    return ELF::R_MICROMIPS_GOT16;
   case Mips::fixup_MICROMIPS_PC7_S1:
-    Type = ELF::R_MICROMIPS_PC7_S1;
-    break;
+    return ELF::R_MICROMIPS_PC7_S1;
   case Mips::fixup_MICROMIPS_PC10_S1:
-    Type = ELF::R_MICROMIPS_PC10_S1;
-    break;
+    return ELF::R_MICROMIPS_PC10_S1;
   case Mips::fixup_MICROMIPS_PC16_S1:
-    Type = ELF::R_MICROMIPS_PC16_S1;
-    break;
+    return ELF::R_MICROMIPS_PC16_S1;
   case Mips::fixup_MICROMIPS_CALL16:
-    Type = ELF::R_MICROMIPS_CALL16;
-    break;
+    return ELF::R_MICROMIPS_CALL16;
   case Mips::fixup_MICROMIPS_GOT_DISP:
-    Type = ELF::R_MICROMIPS_GOT_DISP;
-    break;
+    return ELF::R_MICROMIPS_GOT_DISP;
   case Mips::fixup_MICROMIPS_GOT_PAGE:
-    Type = ELF::R_MICROMIPS_GOT_PAGE;
-    break;
+    return ELF::R_MICROMIPS_GOT_PAGE;
   case Mips::fixup_MICROMIPS_GOT_OFST:
-    Type = ELF::R_MICROMIPS_GOT_OFST;
-    break;
+    return ELF::R_MICROMIPS_GOT_OFST;
   case Mips::fixup_MICROMIPS_TLS_GD:
-    Type = ELF::R_MICROMIPS_TLS_GD;
-    break;
+    return ELF::R_MICROMIPS_TLS_GD;
   case Mips::fixup_MICROMIPS_TLS_LDM:
-    Type = ELF::R_MICROMIPS_TLS_LDM;
-    break;
+    return ELF::R_MICROMIPS_TLS_LDM;
   case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
-    Type = ELF::R_MICROMIPS_TLS_DTPREL_HI16;
-    break;
+    return ELF::R_MICROMIPS_TLS_DTPREL_HI16;
   case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
-    Type = ELF::R_MICROMIPS_TLS_DTPREL_LO16;
-    break;
+    return ELF::R_MICROMIPS_TLS_DTPREL_LO16;
   case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
-    Type = ELF::R_MICROMIPS_TLS_TPREL_HI16;
-    break;
+    return ELF::R_MICROMIPS_TLS_TPREL_HI16;
   case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
-    Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
-    break;
+    return ELF::R_MICROMIPS_TLS_TPREL_LO16;
   case Mips::fixup_MIPS_PC19_S2:
-    Type = ELF::R_MIPS_PC19_S2;
-    break;
+    return ELF::R_MIPS_PC19_S2;
   case Mips::fixup_MIPS_PC18_S3:
-    Type = ELF::R_MIPS_PC18_S3;
-    break;
+    return ELF::R_MIPS_PC18_S3;
   case Mips::fixup_MIPS_PC21_S2:
-    Type = ELF::R_MIPS_PC21_S2;
-    break;
+    return ELF::R_MIPS_PC21_S2;
   case Mips::fixup_MIPS_PC26_S2:
-    Type = ELF::R_MIPS_PC26_S2;
-    break;
+    return ELF::R_MIPS_PC26_S2;
   case Mips::fixup_MIPS_PCHI16:
-    Type = ELF::R_MIPS_PCHI16;
-    break;
+    return ELF::R_MIPS_PCHI16;
   case Mips::fixup_MIPS_PCLO16:
-    Type = ELF::R_MIPS_PCLO16;
-    break;
+    return ELF::R_MIPS_PCLO16;
   }
-  return Type;
+  llvm_unreachable("invalid fixup kind!");
 }
 
 // Sort entries by SortOffset in descending order.
@@ -271,9 +224,7 @@ static unsigned getMatchingLoType(const MCAssembler &Asm,
   if (Type == ELF::R_MIPS16_HI16)
     return ELF::R_MIPS16_LO16;
 
-  const MCSymbolData &SD = Asm.getSymbolData(*Reloc.Symbol);
-
-  if (MCELF::GetBinding(SD) != ELF::STB_LOCAL)
+  if (Reloc.Symbol->getBinding() != ELF::STB_LOCAL)
     return ELF::R_MIPS_NONE;
 
   if (Type == ELF::R_MIPS_GOT16)
@@ -405,9 +356,8 @@ void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
     Relocs[I] = MipsRelocs[I].R;
 }
 
-bool
-MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
-                                             unsigned Type) const {
+bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                  unsigned Type) const {
   // FIXME: This is extremely conservative. This really needs to use a
   // whitelist with a clear explanation for why each realocation needs to
   // point to the symbol, not to the section.
@@ -434,7 +384,7 @@ MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
     return true;
 
   case ELF::R_MIPS_32:
-    if (MCELF::getOther(SD) & (ELF::STO_MIPS_MICROMIPS >> 2))
+    if (cast<MCSymbolELF>(Sym).getOther() & ELF::STO_MIPS_MICROMIPS)
       return true;
     // falltrough
   case ELF::R_MIPS_26:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index d2b51831245a..b45d9cf621d7 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -9,8 +9,8 @@
 
 #include "MipsELFStreamer.h"
 #include "MipsTargetStreamer.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/ELF.h"
 
 using namespace llvm;
@@ -41,12 +41,10 @@ void MipsELFStreamer::createPendingLabelRelocs() {
 
   // FIXME: Also mark labels when in MIPS16 mode.
   if (ELFTargetStreamer->isMicroMipsEnabled()) {
-    for (auto Label : Labels) {
-      MCSymbolData &Data = getOrCreateSymbolData(Label);
-      // The "other" values are stored in the last 6 bits of the second byte.
-      // The traditional defines for STO values assume the full byte and thus
-      // the shift to pack it.
-      MCELF::setOther(Data, ELF::STO_MIPS_MICROMIPS >> 2);
+    for (auto *L : Labels) {
+      auto *Label = cast<MCSymbolELF>(L);
+      getAssembler().registerSymbol(*Label);
+      Label->setOther(ELF::STO_MIPS_MICROMIPS);
     }
   }
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index e2bd5a815ab1..4d554583dc78 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -18,8 +18,7 @@ using namespace llvm;
 
 void MipsMCAsmInfo::anchor() { }
 
-MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
-  Triple TheTriple(TT);
+MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
   if ((TheTriple.getArch() == Triple::mips) ||
       (TheTriple.getArch() == Triple::mips64))
     IsLittleEndian = false;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 59ff1c41ed6e..5d23fcbd7a44 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -17,12 +17,12 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-  class StringRef;
+  class Triple;
 
   class MipsMCAsmInfo : public MCAsmInfoELF {
     void anchor() override;
   public:
-    explicit MipsMCAsmInfo(StringRef TT);
+    explicit MipsMCAsmInfo(const Triple &TheTriple);
   };
 
 } // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index a0d9e1540515..93925bf8ca03 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -467,7 +467,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                const MCSubtargetInfo &STI) const {
   int64_t Res;
 
-  if (Expr->EvaluateAsAbsolute(Res))
+  if (Expr->evaluateAsAbsolute(Res))
     return Res;
 
   MCExpr::ExprKind Kind = Expr->getKind();
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 74490f334b37..c85fc4816b08 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -41,7 +41,7 @@ bool MipsMCExpr::isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
 }
 
 const MipsMCExpr*
-MipsMCExpr::Create(MCSymbolRefExpr::VariantKind VK, const MCExpr *Expr,
+MipsMCExpr::create(MCSymbolRefExpr::VariantKind VK, const MCExpr *Expr,
                    MCContext &Ctx) {
   VariantKind Kind;
   switch (VK) {
@@ -64,7 +64,7 @@ MipsMCExpr::Create(MCSymbolRefExpr::VariantKind VK, const MCExpr *Expr,
   return new (Ctx) MipsMCExpr(Kind, Expr);
 }
 
-void MipsMCExpr::PrintImpl(raw_ostream &OS) const {
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   switch (Kind) {
   default: llvm_unreachable("Invalid kind!");
   case VK_Mips_LO: OS << "%lo"; break;
@@ -74,15 +74,15 @@ void MipsMCExpr::PrintImpl(raw_ostream &OS) const {
   }
 
   OS << '(';
-  Expr->print(OS);
+  Expr->print(OS, MAI);
   OS << ')';
 }
 
 bool
-MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
                                       const MCAsmLayout *Layout,
                                       const MCFixup *Fixup) const {
-  return getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup);
+  return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
 }
 
 void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index ee11461ef174..fd2ed17ee785 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -37,7 +37,7 @@ public:
   static bool isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
                                     const MCBinaryExpr *BE);
 
-  static const MipsMCExpr *Create(MCSymbolRefExpr::VariantKind VK,
+  static const MipsMCExpr *create(MCSymbolRefExpr::VariantKind VK,
                                   const MCExpr *Expr, MCContext &Ctx);
 
   /// getOpcode - Get the kind of this expression.
@@ -46,13 +46,13 @@ public:
   /// getSubExpr - Get the child of this expression.
   const MCExpr *getSubExpr() const { return Expr; }
 
-  void PrintImpl(raw_ostream &OS) const override;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
-  MCSection *FindAssociatedSection() const override {
-    return getSubExpr()->FindAssociatedSection();
+  MCSection *findAssociatedSection() const override {
+    return getSubExpr()->findAssociatedSection();
   }
 
   // There are no TLS MipsMCExprs at the moment.
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 2e3179ac28d9..54d88632abdb 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -75,7 +75,8 @@ static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
+                                      const Triple &TT) {
   MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
 
   unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 8e6f047450e3..a051f4c123fc 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -17,10 +17,9 @@
 #include "MipsTargetObjectFile.h"
 #include "MipsTargetStreamer.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -81,6 +80,12 @@ void MipsTargetStreamer::emitDirectiveSetMips64R5() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetMips64R6() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetPop() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetPush() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetSoftFloat() {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetHardFloat() {
+  forbidModuleDirective();
+}
 void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
@@ -308,6 +313,16 @@ void MipsTargetAsmStreamer::emitDirectiveSetPush() {
  MipsTargetStreamer::emitDirectiveSetPush();
 }
 
+void MipsTargetAsmStreamer::emitDirectiveSetSoftFloat() {
+  OS << "\t.set\tsoftfloat\n";
+  MipsTargetStreamer::emitDirectiveSetSoftFloat();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetHardFloat() {
+  OS << "\t.set\thardfloat\n";
+  MipsTargetStreamer::emitDirectiveSetHardFloat();
+}
+
 // Print a 32 bit hex number with all numbers.
 static void printHex32(unsigned Value, raw_ostream &OS) {
   OS << "0x";
@@ -358,7 +373,6 @@ void MipsTargetAsmStreamer::emitDirectiveModuleFP(
     MipsABIFlagsSection::FpABIKind Value, bool Is32BitABI) {
   MipsTargetStreamer::emitDirectiveModuleFP(Value, Is32BitABI);
 
-  StringRef ModuleValue;
   OS << "\t.module\tfp=";
   OS << ABIFlagsSection.getFpABIString(Value) << "\n";
 }
@@ -367,7 +381,6 @@ void MipsTargetAsmStreamer::emitDirectiveSetFp(
     MipsABIFlagsSection::FpABIKind Value) {
   MipsTargetStreamer::emitDirectiveSetFp(Value);
 
-  StringRef ModuleValue;
   OS << "\t.set\tfp=";
   OS << ABIFlagsSection.getFpABIString(Value) << "\n";
 }
@@ -440,18 +453,16 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   MCA.setELFHeaderEFlags(EFlags);
 }
 
-void MipsTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
+  auto *Symbol = cast<MCSymbolELF>(S);
   if (!isMicroMipsEnabled())
     return;
-  MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Symbol);
-  uint8_t Type = MCELF::GetType(Data);
+  getStreamer().getAssembler().registerSymbol(*Symbol);
+  uint8_t Type = Symbol->getType();
   if (Type != ELF::STT_FUNC)
     return;
 
-  // The "other" values are stored in the last 6 bits of the second byte
-  // The traditional defines for STO values assume the full byte and thus
-  // the shift to pack it.
-  MCELF::setOther(Data, ELF::STO_MIPS_MICROMIPS >> 2);
+  Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
 }
 
 void MipsTargetELFStreamer::finish() {
@@ -505,23 +516,18 @@ void MipsTargetELFStreamer::finish() {
   emitMipsAbiFlags();
 }
 
-void MipsTargetELFStreamer::emitAssignment(MCSymbol *Symbol,
-                                           const MCExpr *Value) {
+void MipsTargetELFStreamer::emitAssignment(MCSymbol *S, const MCExpr *Value) {
+  auto *Symbol = cast<MCSymbolELF>(S);
   // If on rhs is micromips symbol then mark Symbol as microMips.
   if (Value->getKind() != MCExpr::SymbolRef)
     return;
-  const MCSymbol &RhsSym =
-      static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
-  MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym);
+  const auto &RhsSym = cast<MCSymbolELF>(
+      static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
 
-  if (!(MCELF::getOther(Data) & (ELF::STO_MIPS_MICROMIPS >> 2)))
+  if (!(RhsSym.getOther() & ELF::STO_MIPS_MICROMIPS))
     return;
 
-  MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol);
-  // The "other" values are stored in the last 6 bits of the second byte.
-  // The traditional defines for STO values assume the full byte and thus
-  // the shift to pack it.
-  MCELF::setOther(SymbolData, ELF::STO_MIPS_MICROMIPS >> 2);
+  Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
 }
 
 MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
@@ -568,7 +574,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
                                             ELF::SHF_ALLOC | ELF::SHT_REL);
 
   const MCSymbolRefExpr *ExprRef =
-      MCSymbolRefExpr::Create(Name, MCSymbolRefExpr::VK_None, Context);
+      MCSymbolRefExpr::create(Name, MCSymbolRefExpr::VK_None, Context);
 
   MCA.registerSection(*Sec);
   Sec->setAlignment(4);
@@ -693,12 +699,12 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   StringRef SymName("_gp_disp");
   MCAssembler &MCA = getStreamer().getAssembler();
   MCSymbol *GP_Disp = MCA.getContext().getOrCreateSymbol(SymName);
-  MCA.getOrCreateSymbolData(*GP_Disp);
+  MCA.registerSymbol(*GP_Disp);
 
   MCInst TmpInst;
   TmpInst.setOpcode(Mips::LUi);
   TmpInst.addOperand(MCOperand::createReg(Mips::GP));
-  const MCSymbolRefExpr *HiSym = MCSymbolRefExpr::Create(
+  const MCSymbolRefExpr *HiSym = MCSymbolRefExpr::create(
       "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_HI, MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(HiSym));
   getStreamer().EmitInstruction(TmpInst, STI);
@@ -708,7 +714,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.setOpcode(Mips::ADDiu);
   TmpInst.addOperand(MCOperand::createReg(Mips::GP));
   TmpInst.addOperand(MCOperand::createReg(Mips::GP));
-  const MCSymbolRefExpr *LoSym = MCSymbolRefExpr::Create(
+  const MCSymbolRefExpr *LoSym = MCSymbolRefExpr::create(
       "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_LO, MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(LoSym));
   getStreamer().EmitInstruction(TmpInst, STI);
@@ -752,9 +758,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
   getStreamer().EmitInstruction(Inst, STI);
   Inst.clear();
 
-  const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+  const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create(
       &Sym, MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext());
-  const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+  const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create(
       &Sym, MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext());
 
   // lui $gp, %hi(%neg(%gp_rel(funcSym)))
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 272933f7285e..8a27874a37ce 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -305,8 +305,9 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
 def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
   (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
 
-// Cavium Octeon cmMIPS instructions
-let EncodingPredicates = []<Predicate>, // FIXME: The lack of HasStdEnc is probably a bug
+// Cavium Octeon cnMIPS instructions
+let DecoderNamespace = "CnMips",
+    EncodingPredicates = []<Predicate>, // FIXME: The lack of HasStdEnc is probably a bug
     AdditionalPredicates = [HasCnMips] in {
 
 class Count1s<string opstr, RegisterOperand RO>:
@@ -353,6 +354,10 @@ class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op,
   let Defs = [AT];
 }
 
+class MFC2OP<string asmstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt, uimm16:$imm16), (ins),
+         !strconcat(asmstr, "\t$rt, $imm16"), [], NoItinerary, FrmFR>;
+
 // Unsigned Byte Add
 let Pattern = [(set GPR64Opnd:$rd,
                     (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in
@@ -415,6 +420,9 @@ let Defs = [MPL1, MPL2, P0, P1, P2] in
 def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>,
                                  ADD_FM<0x1c, 0x0f>;
 
+// Move between CPU and coprocessor registers
+def DMFC2_OCTEON : MFC2OP<"dmfc2", GPR64Opnd>, MFC2OP_FM<0x12, 1>;
+def DMTC2_OCTEON : MFC2OP<"dmtc2", GPR64Opnd>, MFC2OP_FM<0x12, 5>;
 }
 
 }
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index a3995b8ceb99..f84666b6229e 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -41,7 +41,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -594,11 +594,11 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       break;
 
     case MachineOperand::MO_MachineBasicBlock:
-      O << *MO.getMBB()->getSymbol();
+      MO.getMBB()->getSymbol()->print(O, MAI);
       return;
 
     case MachineOperand::MO_GlobalAddress:
-      O << *getSymbol(MO.getGlobal());
+      getSymbol(MO.getGlobal())->print(O, MAI);
       break;
 
     case MachineOperand::MO_BlockAddress: {
@@ -778,7 +778,7 @@ void MipsAsmPrinter::EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol) {
   MCInst I;
   I.setOpcode(Mips::JAL);
   I.addOperand(
-      MCOperand::createExpr(MCSymbolRefExpr::Create(Symbol, OutContext)));
+      MCOperand::createExpr(MCSymbolRefExpr::create(Symbol, OutContext)));
   OutStreamer->EmitInstruction(I, STI);
 }
 
@@ -983,7 +983,8 @@ void MipsAsmPrinter::EmitFPCallStub(
   //  __call_stub_fp_xxxx:
   //
   std::string x = "__call_stub_fp_" + std::string(Symbol);
-  MCSymbol *Stub = OutContext.getOrCreateSymbol(StringRef(x));
+  MCSymbolELF *Stub =
+      cast<MCSymbolELF>(OutContext.getOrCreateSymbol(StringRef(x)));
   TS.emitDirectiveEnt(*Stub);
   MCSymbol *MType =
       OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
@@ -1028,10 +1029,10 @@ void MipsAsmPrinter::EmitFPCallStub(
 
   MCSymbol *Tmp = OutContext.createTempSymbol();
   OutStreamer->EmitLabel(Tmp);
-  const MCSymbolRefExpr *E = MCSymbolRefExpr::Create(Stub, OutContext);
-  const MCSymbolRefExpr *T = MCSymbolRefExpr::Create(Tmp, OutContext);
-  const MCExpr *T_min_E = MCBinaryExpr::CreateSub(T, E, OutContext);
-  OutStreamer->EmitELFSize(Stub, T_min_E);
+  const MCSymbolRefExpr *E = MCSymbolRefExpr::create(Stub, OutContext);
+  const MCSymbolRefExpr *T = MCSymbolRefExpr::create(Tmp, OutContext);
+  const MCExpr *T_min_E = MCBinaryExpr::createSub(T, E, OutContext);
+  OutStreamer->emitELFSize(Stub, T_min_E);
   TS.emitDirectiveEnd(x);
   OutStreamer->PopSection();
 }
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 4faee10744b3..3d020abe2704 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -82,6 +82,7 @@ class MipsFastISel final : public FastISel {
   LLVMContext *Context;
 
   bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
 
   bool TargetSupported;
   bool UnsupportedFPMode; // To allow fast-isel to proceed and just not handle
@@ -94,6 +95,7 @@ private:
   bool selectLoad(const Instruction *I);
   bool selectStore(const Instruction *I);
   bool selectBranch(const Instruction *I);
+  bool selectSelect(const Instruction *I);
   bool selectCmp(const Instruction *I);
   bool selectFPExt(const Instruction *I);
   bool selectFPTrunc(const Instruction *I);
@@ -102,6 +104,7 @@ private:
   bool selectTrunc(const Instruction *I);
   bool selectIntExt(const Instruction *I);
   bool selectShift(const Instruction *I);
+  bool selectDivRem(const Instruction *I, unsigned ISDOpcode);
 
   // Utility helper routines.
   bool isTypeLegal(Type *Ty, MVT &VT);
@@ -140,6 +143,7 @@ private:
   unsigned materializeGV(const GlobalValue *GV, MVT VT);
   unsigned materializeInt(const Constant *C, MVT VT);
   unsigned materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+  unsigned materializeExternalCallSym(const char *SymName);
 
   MachineInstrBuilder emitInst(unsigned Opc) {
     return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
@@ -156,6 +160,12 @@ private:
                                    unsigned MemReg, int64_t MemOffset) {
     return emitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
   }
+
+  unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC,
+                           unsigned Op0, bool Op0IsKill,
+                           unsigned Op1, bool Op1IsKill);
+
   // for some reason, this default is not generated by tablegen
   // so we explicitly generate it here.
   //
@@ -359,6 +369,15 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
   return DestReg;
 }
 
+// Load the address of external symbol SymName through a MO_GOT reference.
+unsigned MipsFastISel::materializeExternalCallSym(const char *SymName) {
+  unsigned AddrReg = createResultReg(&Mips::GPR32RegClass);
+  MachineInstrBuilder Load = emitInst(Mips::LW, AddrReg);
+  Load.addReg(MFI->getGlobalBaseReg());
+  Load.addExternalSymbol(SymName, MipsII::MO_GOT);
+  return AddrReg;
+}
+
 // Materialize a constant into a register, and return the register
 // number (or zero if we failed to handle it).
 unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
@@ -463,15 +482,51 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
 }
 
 bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) {
-  const GlobalValue *GV = dyn_cast<GlobalValue>(V);
-  if (GV && isa<Function>(GV) && cast<Function>(GV)->isIntrinsic())
-    return false;
-  if (!GV)
-    return false;
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+
+  if (const auto *I = dyn_cast<Instruction>(V)) {
+    // Check if the value is defined in the same basic block. This information
+    // is crucial to know whether or not folding an operand is valid.
+    if (I->getParent() == FuncInfo.MBB->getBasicBlock()) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const auto *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default:
+    break;
+  case Instruction::BitCast:
+    // Look past bitcasts if its operand is in the same BB. Unlike the
+    // int<->ptr casts below, this is done without checking the value types.
+    return computeCallAddress(U->getOperand(0), Addr);
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  }
+
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
     Addr.setGlobalValue(GV);
     return true;
   }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!Addr.getGlobalValue()) {
+    Addr.setReg(getRegForValue(V));
+    return Addr.getReg() != 0;
+  }
+
   return false;
 }
 
@@ -893,6 +948,50 @@ bool MipsFastISel::selectFPExt(const Instruction *I) {
   return true;
 }
 
+// Attempt to fast-select a select instruction as a conditional move.
+bool MipsFastISel::selectSelect(const Instruction *I) {
+  assert(isa<SelectInst>(I) && "Expected a select instruction.");
+
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT))
+    return false;
+
+  unsigned CondMovOpc;
+  const TargetRegisterClass *RC;
+  // The FP forms are not attempted when UnsupportedFPMode is set.
+  if (VT.isInteger() && !VT.isVector() && VT.getSizeInBits() <= 32) {
+    CondMovOpc = Mips::MOVN_I_I;
+    RC = &Mips::GPR32RegClass;
+  } else if (VT == MVT::f32 && !UnsupportedFPMode) {
+    CondMovOpc = Mips::MOVN_I_S;
+    RC = &Mips::FGR32RegClass;
+  } else if (VT == MVT::f64 && !UnsupportedFPMode) {
+    CondMovOpc = Mips::MOVN_I_D32;
+    RC = &Mips::AFGR64RegClass;
+  } else
+    return false;
+
+  const SelectInst *SI = cast<SelectInst>(I);
+  unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+  unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+  unsigned CondReg = getRegForValue(SI->getCondition());
+
+  if (!Src1Reg || !Src2Reg || !CondReg)
+    return false;
+
+  unsigned ResultReg = createResultReg(RC);
+  unsigned TempReg = createResultReg(RC);
+
+  if (!ResultReg || !TempReg)
+    return false;
+
+  emitInst(TargetOpcode::COPY, TempReg).addReg(Src2Reg);
+  emitInst(CondMovOpc, ResultReg)
+    .addReg(Src1Reg).addReg(CondReg).addReg(TempReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
 // Attempt to fast-select a floating-point truncate instruction.
 bool MipsFastISel::selectFPTrunc(const Instruction *I) {
   if (UnsupportedFPMode)
@@ -1135,7 +1234,7 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
   bool IsTailCall = CLI.IsTailCall;
   bool IsVarArg = CLI.IsVarArg;
   const Value *Callee = CLI.Callee;
-  // const char *SymName = CLI.SymName;
+  const char *SymName = CLI.SymName;
 
   // Allow SelectionDAG isel to handle tail calls.
   if (IsTailCall)
@@ -1182,8 +1281,15 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
   if (!processCallArgs(CLI, OutVTs, NumBytes))
     return false;
 
+  if (!Addr.getGlobalValue())
+    return false;
+
   // Issue the call.
-  unsigned DestAddress = materializeGV(Addr.getGlobalValue(), MVT::i32);
+  unsigned DestAddress;
+  if (SymName)
+    DestAddress = materializeExternalCallSym(SymName);
+  else
+    DestAddress = materializeGV(Addr.getGlobalValue(), MVT::i32);
   emitInst(TargetOpcode::COPY, Mips::T9).addReg(DestAddress);
   MachineInstrBuilder MIB =
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::JALR),
@@ -1203,6 +1309,98 @@ bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
   return finishCall(CLI, RetVT, NumBytes);
 }
 
+// Lower the intrinsics handled here: bswap, memcpy, memmove and memset.
+bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::bswap: {
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeSupported(RetTy, VT))
+      return false;
+
+    unsigned SrcReg = getRegForValue(II->getOperand(0));
+    if (SrcReg == 0)
+      return false;
+    unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+    if (DestReg == 0)
+      return false;
+    if (VT == MVT::i16) {
+      if (Subtarget->hasMips32r2()) {
+        emitInst(Mips::WSBH, DestReg).addReg(SrcReg);
+        updateValueMap(II, DestReg);
+        return true;
+      }
+      unsigned TempReg[3];
+      for (int i = 0; i < 3; i++) {
+        TempReg[i] = createResultReg(&Mips::GPR32RegClass);
+        if (TempReg[i] == 0)
+          return false;
+      }
+      emitInst(Mips::SLL, TempReg[0]).addReg(SrcReg).addImm(8);
+      emitInst(Mips::SRL, TempReg[1]).addReg(SrcReg).addImm(8);
+      emitInst(Mips::OR, TempReg[2]).addReg(TempReg[0]).addReg(TempReg[1]);
+      emitInst(Mips::ANDi, DestReg).addReg(TempReg[2]).addImm(0xFFFF);
+      updateValueMap(II, DestReg);
+      return true;
+    } else if (VT == MVT::i32) {
+      if (Subtarget->hasMips32r2()) {
+        unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+        if (TempReg == 0)
+          return false;
+        emitInst(Mips::WSBH, TempReg).addReg(SrcReg);
+        emitInst(Mips::ROTR, DestReg).addReg(TempReg).addImm(16);
+        updateValueMap(II, DestReg);
+        return true;
+      }
+      unsigned TempReg[8];
+      for (int i = 0; i < 8; i++) {
+        TempReg[i] = createResultReg(&Mips::GPR32RegClass);
+        if (TempReg[i] == 0)
+          return false;
+      }
+
+      emitInst(Mips::SRL, TempReg[0]).addReg(SrcReg).addImm(8);
+      emitInst(Mips::SRL, TempReg[1]).addReg(SrcReg).addImm(24);
+      emitInst(Mips::ANDi, TempReg[2]).addReg(TempReg[0]).addImm(0xFF00);
+      emitInst(Mips::OR, TempReg[3]).addReg(TempReg[1]).addReg(TempReg[2]);
+
+      emitInst(Mips::ANDi, TempReg[4]).addReg(SrcReg).addImm(0xFF00);
+      emitInst(Mips::SLL, TempReg[5]).addReg(TempReg[4]).addImm(8);
+      emitInst(Mips::SLL, TempReg[6]).addReg(SrcReg).addImm(24);
+      emitInst(Mips::OR, TempReg[7]).addReg(TempReg[3]).addReg(TempReg[5]);
+      emitInst(Mips::OR, DestReg).addReg(TempReg[6]).addReg(TempReg[7]);
+      updateValueMap(II, DestReg);
+      return true;
+    }
+    return false;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const auto *MTI = cast<MemTransferInst>(II);
+    // Don't handle volatile.
+    if (MTI->isVolatile())
+      return false;
+    if (!MTI->getLength()->getType()->isIntegerTy(32))
+      return false;
+    const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+    return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst *MSI = cast<MemSetInst>(II);
+    // Don't handle volatile.
+    if (MSI->isVolatile())
+      return false;
+    if (!MSI->getLength()->getType()->isIntegerTy(32))
+      return false;
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  }
+  return false;
+}
+
 bool MipsFastISel::selectRet(const Instruction *I) {
   const Function &F = *I->getParent()->getParent();
   const ReturnInst *Ret = cast<ReturnInst>(I);
@@ -1420,6 +1618,50 @@ unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
   return Success ? DestReg : 0;
 }
 
+// Lower a 32-bit divide/remainder: DIV, zero-divisor TEQ, then MFLO/MFHI.
+bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) {
+  EVT ResultEVT = TLI.getValueType(I->getType(), true);
+  if (!ResultEVT.isSimple() || ResultEVT.getSimpleVT() != MVT::i32)
+    return false;
+
+  unsigned DivOpc, MoveOpc;
+  switch (ISDOpcode) {
+  case ISD::SDIV:
+    DivOpc = Mips::SDIV;
+    MoveOpc = Mips::MFLO;
+    break;
+  case ISD::SREM:
+    DivOpc = Mips::SDIV;
+    MoveOpc = Mips::MFHI;
+    break;
+  case ISD::UDIV:
+    DivOpc = Mips::UDIV;
+    MoveOpc = Mips::MFLO;
+    break;
+  case ISD::UREM:
+    DivOpc = Mips::UDIV;
+    MoveOpc = Mips::MFHI;
+    break;
+  default:
+    return false;
+  }
+
+  unsigned DividendReg = getRegForValue(I->getOperand(0));
+  unsigned DivisorReg = getRegForValue(I->getOperand(1));
+  if (DividendReg == 0 || DivisorReg == 0)
+    return false;
+
+  emitInst(DivOpc).addReg(DividendReg).addReg(DivisorReg);
+  emitInst(Mips::TEQ).addReg(DivisorReg).addReg(Mips::ZERO).addImm(7);
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (ResultReg == 0)
+    return false;
+
+  emitInst(MoveOpc, ResultReg);
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
 bool MipsFastISel::selectShift(const Instruction *I) {
   MVT RetVT;
 
@@ -1505,6 +1747,22 @@ bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
     return selectLoad(I);
   case Instruction::Store:
     return selectStore(I);
+  case Instruction::SDiv:
+    if (!selectBinaryOp(I, ISD::SDIV))
+      return selectDivRem(I, ISD::SDIV);
+    return true;
+  case Instruction::UDiv:
+    if (!selectBinaryOp(I, ISD::UDIV))
+      return selectDivRem(I, ISD::UDIV);
+    return true;
+  case Instruction::SRem:
+    if (!selectBinaryOp(I, ISD::SREM))
+      return selectDivRem(I, ISD::SREM);
+    return true;
+  case Instruction::URem:
+    if (!selectBinaryOp(I, ISD::UREM))
+      return selectDivRem(I, ISD::UREM);
+    return true;
   case Instruction::Shl:
   case Instruction::LShr:
   case Instruction::AShr:
@@ -1533,6 +1791,8 @@ bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
   case Instruction::ICmp:
   case Instruction::FCmp:
     return selectCmp(I);
+  case Instruction::Select:
+    return selectSelect(I);
   }
   return false;
 }
@@ -1563,6 +1823,33 @@ void MipsFastISel::simplifyAddress(Address &Addr) {
   }
 }
 
+unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill) {
+  // Everything except MUL takes the generic path.
+  if (MachineInstOpcode != Mips::MUL)
+    return FastISel::fastEmitInst_rr(MachineInstOpcode, RC, Op0, Op0IsKill,
+                                     Op1, Op1IsKill);
+
+  // MUL clobbers HI0 and LO0, but its TableGen definition can only mark them
+  // as implicitly defined. The register allocator then runs out of registers
+  // when a following instruction defines the same registers, so build MUL by
+  // hand and flag both implicit definitions as dead.
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &Desc = TII.get(MachineInstOpcode);
+  Op0 = constrainOperandRegClass(Desc, Op0, Desc.getNumDefs());
+  Op1 = constrainOperandRegClass(Desc, Op1, Desc.getNumDefs() + 1);
+
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, ResultReg);
+  MIB.addReg(Op0, getKillRegState(Op0IsKill));
+  MIB.addReg(Op1, getKillRegState(Op1IsKill));
+  MIB.addReg(Mips::HI0, RegState::ImplicitDefine | RegState::Dead);
+  MIB.addReg(Mips::LO0, RegState::ImplicitDefine | RegState::Dead);
+  return ResultReg;
+}
+
 namespace llvm {
 FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
                                const TargetLibraryInfo *libInfo) {
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 826fbaf4d00a..a74c8abd2e2d 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -90,12 +90,23 @@ const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) {
 }
 
 // hasFP - Return true if the specified function should have a dedicated frame
-// pointer register.  This is true if the function has variable sized allocas or
-// if frame pointer elimination is disabled.
+// pointer register.  This is true if the function has variable sized allocas,
+// if it needs dynamic stack realignment, if frame pointer elimination is
+// disabled, or if the frame address is taken.
 bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
   return MF.getTarget().Options.DisableFramePointerElim(MF) ||
-      MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+      MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() ||
+      TRI->needsStackRealignment(MF);
+}
+
+// hasBP - Return true if the function has variable sized allocas and also
+// needs dynamic stack realignment.
+bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasVarSizedObjects() &&
+         STI.getRegisterInfo()->needsStackRealignment(MF);
+}
 
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 0b5183081e09..5eabd58e8686 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -32,6 +32,8 @@ public:
 
   bool hasFP(const MachineFunction &MF) const override;
 
+  bool hasBP(const MachineFunction &MF) const;
+
   bool isFPCloseToIncomingSP() const override { return false; }
 
   void
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 6c7f0895b426..67ddcc4dacb9 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -3547,7 +3547,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 }
 
 bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                               Type *Ty) const {
+                                               Type *Ty,
+                                               unsigned AS) const {
   // No global is ever allowed as a base.
   if (AM.BaseGV)
     return false;
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 6ea14b53a57f..bc9a1ce64097 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -514,7 +514,8 @@ namespace llvm {
       return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
     }
 
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                               unsigned AS) const override;
 
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 02ecf32d3e47..5f4fcc354616 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -226,6 +226,18 @@ class MFC3OP_FM<bits<6> op, bits<5> mfmt>
   let Inst{2-0}   = sel;
 }
 
+class MFC2OP_FM<bits<6> op, bits<5> mfmt> : StdArch {
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = mfmt;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
 class ADD_FM<bits<6> op, bits<6> funct> : StdArch {
   bits<5> rd;
   bits<5> rs;
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 9e611804376b..6b2a44d7a893 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -101,7 +101,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
     llvm_unreachable("<unknown operand type>");
   }
 
-  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx);
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
 
   if (!Offset)
     return MCOperand::createExpr(MCSym);
@@ -109,8 +109,8 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   // Assume offset is never negative.
   assert(Offset > 0);
 
-  const MCConstantExpr *OffsetExpr =  MCConstantExpr::Create(Offset, *Ctx);
-  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::createAdd(MCSym, OffsetExpr, *Ctx);
   return MCOperand::createExpr(Add);
 }
 
@@ -155,11 +155,11 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
 MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
                                      MachineBasicBlock *BB2,
                                      MCSymbolRefExpr::VariantKind Kind) const {
-  const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::Create(BB1->getSymbol(), *Ctx);
-  const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::Create(BB2->getSymbol(), *Ctx);
-  const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Sym1, Sym2, *Ctx);
+  const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::create(BB1->getSymbol(), *Ctx);
+  const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
+  const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
 
-  return MCOperand::createExpr(MipsMCExpr::Create(Kind, Sub, *Ctx));
+  return MCOperand::createExpr(MipsMCExpr::create(Kind, Sub, *Ctx));
 }
 
 void MipsMCInstLower::
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index f72fb4d622ec..f6647e6a8468 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/IR/Constants.h"
@@ -178,6 +179,15 @@ getReservedRegs(const MachineFunction &MF) const {
     else {
       Reserved.set(Mips::FP);
       Reserved.set(Mips::FP_64);
+
+      // Reserve the base register if we need to both realign the stack and
+      // allocate variable-sized objects at runtime. This should test the
+      // same conditions as MipsFrameLowering::hasBP().
+      if (needsStackRealignment(MF) &&
+          MF.getFrameInfo()->hasVarSizedObjects()) {
+        Reserved.set(Mips::S7);
+        Reserved.set(Mips::S7_64);
+      }
     }
   }
 
@@ -271,6 +281,67 @@ getFrameRegister(const MachineFunction &MF) const {
   else
     return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
                             (IsN64 ? Mips::SP_64 : Mips::SP);
+}
 
+bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+  unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64;
+  unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64;
+
+  // Support dynamic stack realignment only for targets with standard encoding.
+  if (!Subtarget.hasStandardEncoding())
+    return false;
+
+  // We can't perform dynamic stack realignment if we can't reserve the
+  // frame pointer register.
+  if (!MF.getRegInfo().canReserveReg(FP))
+    return false;
+
+  // We can realign the stack if we know the maximum call frame size and we
+  // don't have variable sized objects.
+  if (Subtarget.getFrameLowering()->hasReservedCallFrame(MF))
+    return true;
+
+  // We have to reserve the base pointer register in the presence of variable
+  // sized objects.
+  return MF.getRegInfo().canReserveReg(BP);
 }
 
+bool MipsRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  bool CanRealign = canRealignStack(MF);
+
+  // Avoid realigning functions that explicitly do not want to be realigned.
+  // Normally, we should report an error when a function should be dynamically
+  // realigned but also has the attribute no-realign-stack. Unfortunately,
+  // with this attribute, MachineFrameInfo clamps each new object's alignment
+  // to that of the stack's alignment as specified by the ABI. As a result,
+  // the information of whether we have objects with larger alignment
+  // requirement than the stack's alignment is already lost at this point.
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+    return false;
+
+  const Function *F = MF.getFunction();
+  if (F->hasFnAttribute(Attribute::StackAlignment)) {
+#ifdef DEBUG
+    if (!CanRealign)
+      DEBUG(dbgs() << "It's not possible to realign the stack of the function: "
+            << F->getName() << "\n");
+#endif
+    return CanRealign;
+  }
+
+  unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment();
+  if (MFI->getMaxAlignment() > StackAlignment) {
+#ifdef DEBUG
+    if (!CanRealign)
+      DEBUG(dbgs() << "It's not possible to realign the stack of the function: "
+            << F->getName() << "\n");
+#endif
+    return CanRealign;
+  }
+
+  return false;
+}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 76e84bd142b9..ee1f6bcd7390 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -57,6 +57,14 @@ public:
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
 
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = nullptr) const;
+
+  // Stack realignment queries.
+  bool canRealignStack(const MachineFunction &MF) const;
+
+  bool needsStackRealignment(const MachineFunction &MF) const override;
+
   /// Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const override;
 
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 19efa59e1fdf..ec7bf314c641 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -382,6 +382,11 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
   unsigned FP = ABI.GetFramePtr();
   unsigned ZERO = ABI.GetNullPtr();
   unsigned ADDu = ABI.GetPtrAdduOp();
+  unsigned ADDiu = ABI.GetPtrAddiuOp();
+  unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND;
+
+  const TargetRegisterClass *RC = ABI.ArePtrs64bit() ?
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
 
   // First, compute final stack size.
   uint64_t StackSize = MFI->getStackSize();
@@ -464,15 +469,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (MipsFI->callsEhReturn()) {
-    const TargetRegisterClass *PtrRC =
-        ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
-
     // Insert instructions that spill eh data registers.
     for (int I = 0; I < 4; ++I) {
       if (!MBB.isLiveIn(ABI.GetEhDataReg(I)))
         MBB.addLiveIn(ABI.GetEhDataReg(I));
       TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false,
-                              MipsFI->getEhDataRegFI(I), PtrRC, &RegInfo);
+                              MipsFI->getEhDataRegFI(I), RC, &RegInfo);
     }
 
     // Emit .cfi_offset directives for eh data registers.
@@ -497,6 +499,26 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
         nullptr, MRI->getDwarfRegNum(FP, true)));
     BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex);
+
+    if (RegInfo.needsStackRealignment(MF)) {
+      // addiu $Reg, $zero, -MaxAlignment
+      // and $sp, $sp, $Reg
+      unsigned VR = MF.getRegInfo().createVirtualRegister(RC);
+      assert(isInt<16>(MFI->getMaxAlignment()) &&
+             "Function's alignment size requirement is not supported.");
+      int MaxAlign = - (signed) MFI->getMaxAlignment();
+
+      BuildMI(MBB, MBBI, dl, TII.get(ADDiu), VR).addReg(ZERO) .addImm(MaxAlign);
+      BuildMI(MBB, MBBI, dl, TII.get(AND), SP).addReg(SP).addReg(VR);
+
+      if (hasBP(MF)) {
+        // move $s7, $sp
+        unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7;
+        BuildMI(MBB, MBBI, dl, TII.get(ADDu), BP)
+          .addReg(SP)
+          .addReg(ZERO);
+      }
+    }
   }
 }
 
@@ -606,10 +628,14 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
   MipsABIInfo ABI = STI.getABI();
   unsigned FP = ABI.GetFramePtr();
+  unsigned BP = ABI.IsN64() ? Mips::S7_64 : Mips::S7;
 
   // Mark $fp as used if function has dedicated frame pointer.
   if (hasFP(MF))
     MRI.setPhysRegUsed(FP);
+  // Mark $s7 as used if function has dedicated base pointer.
+  if (hasBP(MF))
+    MRI.setPhysRegUsed(BP);
 
   // Create spill slots for eh data registers if function calls eh_return.
   if (MipsFI->callsEhReturn())
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 8c74a98ecca6..132c3a1001ad 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -110,8 +110,11 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
   MachineFunction &MF = *MI.getParent()->getParent();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
   MipsABIInfo ABI =
       static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI();
+  const MipsRegisterInfo *RegInfo =
+    static_cast<const MipsRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   int MinCSFI = 0;
@@ -135,7 +138,14 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
 
   if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI)
     FrameReg = ABI.GetStackPtr();
-  else
+  else if (RegInfo->needsStackRealignment(MF)) {
+    if (MFI->hasVarSizedObjects() && !MFI->isFixedObjectIndex(FrameIndex))
+      FrameReg = ABI.GetBasePtr();
+    else if (MFI->isFixedObjectIndex(FrameIndex))
+      FrameReg = getFrameRegister(MF);
+    else
+      FrameReg = ABI.GetStackPtr();
+  } else
     FrameReg = getFrameRegister(MF);
 
   // Calculate final offset.
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 22b0c6c6685d..fed06005e9c8 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -72,6 +72,8 @@ public:
   virtual void emitDirectiveSetNoDsp();
   virtual void emitDirectiveSetPop();
   virtual void emitDirectiveSetPush();
+  virtual void emitDirectiveSetSoftFloat();
+  virtual void emitDirectiveSetHardFloat();
 
   // PIC support
   virtual void emitDirectiveCpLoad(unsigned RegNo);
@@ -188,6 +190,8 @@ public:
   void emitDirectiveSetNoDsp() override;
   void emitDirectiveSetPop() override;
   void emitDirectiveSetPush() override;
+  void emitDirectiveSetSoftFloat() override;
+  void emitDirectiveSetHardFloat() override;
 
   // PIC support
   void emitDirectiveCpLoad(unsigned RegNo) override;
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index 3615c146a527..6a65943515bb 100644
--- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -23,9 +23,9 @@ extern "C" void LLVMInitializeMipsTargetInfo() {
         /*HasJIT=*/true> Y(TheMipselTarget, "mipsel", "Mipsel");
 
   RegisterTarget<Triple::mips64,
-        /*HasJIT=*/false> A(TheMips64Target, "mips64", "Mips64 [experimental]");
+        /*HasJIT=*/true> A(TheMips64Target, "mips64", "Mips64 [experimental]");
 
   RegisterTarget<Triple::mips64el,
-        /*HasJIT=*/false> B(TheMips64elTarget,
+        /*HasJIT=*/true> B(TheMips64elTarget,
                             "mips64el", "Mips64el [experimental]");
 }
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index cdd2f1f5944f..d48a7a9b1fcc 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -20,7 +20,7 @@ set(NVPTXCodeGen_sources
   NVPTXImageOptimizer.cpp
   NVPTXInstrInfo.cpp
   NVPTXLowerAggrCopies.cpp
-  NVPTXLowerStructArgs.cpp
+  NVPTXLowerKernelArgs.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
   NVPTXRegisterInfo.cpp
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index ac92df901243..4594c22b8701 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -85,7 +85,7 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
   } else {
     assert(Op.isExpr() && "Unknown operand kind in printOperand");
-    O << *Op.getExpr();
+    Op.getExpr()->print(O, &MAI);
   }
 }
 
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index b9df3d18f941..ef36c13b49f1 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -25,8 +25,7 @@ static cl::opt<bool> CompileForDebugging("debug-compile",
 
 void NVPTXMCAsmInfo::anchor() {}
 
-NVPTXMCAsmInfo::NVPTXMCAsmInfo(StringRef TT) {
-  Triple TheTriple(TT);
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
   if (TheTriple.getArch() == Triple::nvptx64) {
     PointerSize = CalleeSaveStackSlotSize = 8;
   }
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index c3242866b177..b432e065c2f4 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -18,12 +18,12 @@
 
 namespace llvm {
 class Target;
-class StringRef;
+class Triple;
 
 class NVPTXMCAsmInfo : public MCAsmInfo {
   virtual void anchor();
 public:
-  explicit NVPTXMCAsmInfo(StringRef TT);
+  explicit NVPTXMCAsmInfo(const Triple &TheTriple);
 };
 } // namespace llvm
 
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 382525d27a25..477b0bac6ca8 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -69,7 +69,7 @@ ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
-FunctionPass *createNVPTXLowerStructArgsPass();
+FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 3bbea400e53e..298b992b241f 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -266,7 +266,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
     MCOp = MCOperand::createImm(MO.getImm());
     break;
   case MachineOperand::MO_MachineBasicBlock:
-    MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(
+    MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
         MO.getMBB()->getSymbol(), OutContext));
     break;
   case MachineOperand::MO_ExternalSymbol:
@@ -283,11 +283,11 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
     default: report_fatal_error("Unsupported FP type"); break;
     case Type::FloatTyID:
       MCOp = MCOperand::createExpr(
-        NVPTXFloatMCExpr::CreateConstantFPSingle(Val, OutContext));
+        NVPTXFloatMCExpr::createConstantFPSingle(Val, OutContext));
       break;
     case Type::DoubleTyID:
       MCOp = MCOperand::createExpr(
-        NVPTXFloatMCExpr::CreateConstantFPDouble(Val, OutContext));
+        NVPTXFloatMCExpr::createConstantFPDouble(Val, OutContext));
       break;
     }
     break;
@@ -334,7 +334,7 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
 
 MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
   const MCExpr *Expr;
-  Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+  Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
                                  OutContext);
   return MCOperand::createExpr(Expr);
 }
@@ -418,9 +418,8 @@ void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
 bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
     const MachineBasicBlock &MBB) const {
   MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
-  // TODO: isLoopHeader() should take "const MachineBasicBlock *".
   // We insert .pragma "nounroll" only to the loop header.
-  if (!LI.isLoopHeader(const_cast<MachineBasicBlock *>(&MBB)))
+  if (!LI.isLoopHeader(&MBB))
     return false;
 
   // llvm.loop.unroll.disable is marked on the back edges of a loop. Therefore,
@@ -468,7 +467,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
     printReturnValStr(*MF, O);
   }
 
-  O << *CurrentFnSym;
+  CurrentFnSym->print(O, MAI);
 
   emitFunctionParamList(*MF, O);
 
@@ -625,7 +624,8 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
   else
     O << ".func ";
   printReturnValStr(F, O);
-  O << *getSymbol(F) << "\n";
+  getSymbol(F)->print(O, MAI);
+  O << "\n";
   emitFunctionParamList(F, O);
   O << ";\n";
 }
@@ -1172,7 +1172,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
     else
       O << getPTXFundamentalTypeStr(ETy, false);
     O << " ";
-    O << *getSymbol(GVar);
+    getSymbol(GVar)->print(O, MAI);
 
     // Ptx allows variable initilization only for constant and global state
     // spaces.
@@ -1189,11 +1189,9 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
         // The frontend adds zero-initializer to variables that don't have an
         // initial value, so skip warning for this case.
         if (!GVar->getInitializer()->isNullValue()) {
-          std::string warnMsg =
-              ("initial value of '" + GVar->getName() +
-               "' is not allowed in addrspace(" +
-               Twine(llvm::utostr_32(PTy->getAddressSpace())) + ")").str();
-          report_fatal_error(warnMsg.c_str());
+          report_fatal_error("initial value of '" + GVar->getName() +
+                             "' is not allowed in addrspace(" +
+                             Twine(PTy->getAddressSpace()) + ")");
         }
       }
     }
@@ -1220,15 +1218,21 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           bufferAggregateConstant(Initializer, &aggBuffer);
           if (aggBuffer.numSymbols) {
             if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
-              O << " .u64 " << *getSymbol(GVar) << "[";
+              O << " .u64 ";
+              getSymbol(GVar)->print(O, MAI);
+              O << "[";
               O << ElementSize / 8;
             } else {
-              O << " .u32 " << *getSymbol(GVar) << "[";
+              O << " .u32 ";
+              getSymbol(GVar)->print(O, MAI);
+              O << "[";
               O << ElementSize / 4;
             }
             O << "]";
           } else {
-            O << " .b8 " << *getSymbol(GVar) << "[";
+            O << " .b8 ";
+            getSymbol(GVar)->print(O, MAI);
+            O << "[";
             O << ElementSize;
             O << "]";
           }
@@ -1236,7 +1240,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           aggBuffer.print();
           O << "}";
         } else {
-          O << " .b8 " << *getSymbol(GVar);
+          O << " .b8 ";
+          getSymbol(GVar)->print(O, MAI);
           if (ElementSize) {
             O << "[";
             O << ElementSize;
@@ -1244,7 +1249,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           }
         }
       } else {
-        O << " .b8 " << *getSymbol(GVar);
+        O << " .b8 ";
+        getSymbol(GVar)->print(O, MAI);
         if (ElementSize) {
           O << "[";
           O << ElementSize;
@@ -1351,7 +1357,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
     O << " .";
     O << getPTXFundamentalTypeStr(ETy);
     O << " ";
-    O << *getSymbol(GVar);
+    getSymbol(GVar)->print(O, MAI);
     return;
   }
 
@@ -1366,9 +1372,11 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
   case Type::ArrayTyID:
   case Type::VectorTyID:
     ElementSize = TD->getTypeStoreSize(ETy);
-    O << " .b8 " << *getSymbol(GVar) << "[";
+    O << " .b8 ";
+    getSymbol(GVar)->print(O, MAI);
+    O << "[";
     if (ElementSize) {
-      O << itostr(ElementSize);
+      O << ElementSize;
     }
     O << "]";
     break;
@@ -1408,11 +1416,13 @@ static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
 
 void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
                                      int paramIndex, raw_ostream &O) {
-  O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
+  getSymbol(I->getParent())->print(O, MAI);
+  O << "_param_" << paramIndex;
 }
 
 void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
-  O << *CurrentFnSym << "_param_" << paramIndex;
+  CurrentFnSym->print(O, MAI);
+  O << "_param_" << paramIndex;
 }
 
 void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
@@ -1446,21 +1456,24 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
               O << "\t.param .u64 .ptr .surfref ";
             else
               O << "\t.param .surfref ";
-            O << *CurrentFnSym << "_param_" << paramIndex;
+            CurrentFnSym->print(O, MAI);
+            O << "_param_" << paramIndex;
           }
           else { // Default image is read_only
             if (nvptxSubtarget->hasImageHandles())
               O << "\t.param .u64 .ptr .texref ";
             else
               O << "\t.param .texref ";
-            O << *CurrentFnSym << "_param_" << paramIndex;
+            CurrentFnSym->print(O, MAI);
+            O << "_param_" << paramIndex;
           }
         } else {
           if (nvptxSubtarget->hasImageHandles())
             O << "\t.param .u64 .ptr .samplerref ";
           else
             O << "\t.param .samplerref ";
-          O << *CurrentFnSym << "_param_" << paramIndex;
+          CurrentFnSym->print(O, MAI);
+          O << "_param_" << paramIndex;
         }
         continue;
       }
@@ -1716,10 +1729,10 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
     }
     if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
       O << "generic(";
-      O << *getSymbol(GVar);
+      getSymbol(GVar)->print(O, MAI);
       O << ")";
     } else {
-      O << *getSymbol(GVar);
+      getSymbol(GVar)->print(O, MAI);
     }
     return;
   }
@@ -1733,20 +1746,44 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
     if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
       if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
         O << "generic(";
-        O << *getSymbol(GVar);
+        getSymbol(GVar)->print(O, MAI);
         O << ")";
       } else {
-        O << *getSymbol(GVar);
+        getSymbol(GVar)->print(O, MAI);
       }
       return;
     } else {
-      O << *lowerConstant(CPV);
+      lowerConstant(CPV)->print(O, MAI);
       return;
     }
   }
   llvm_unreachable("Not scalar type found in printScalarConstant()");
 }
 
+// These utility functions ensure we get the right sequence of bytes for a
+// given type, even on big-endian machines.
+template <typename T> static void ConvertIntToBytes(unsigned char *p, T val) {
+  int64_t vp = (int64_t)val;
+  for (unsigned i = 0; i < sizeof(T); ++i) {
+    p[i] = (unsigned char)vp;
+    vp >>= 8;
+  }
+}
+static void ConvertFloatToBytes(unsigned char *p, float val) {
+  int32_t *vp = (int32_t *)&val;
+  for (unsigned i = 0; i < sizeof(int32_t); ++i) {
+    p[i] = (unsigned char)*vp;
+    *vp >>= 8;
+  }
+}
+static void ConvertDoubleToBytes(unsigned char *p, double val) {
+  int64_t *vp = (int64_t *)&val;
+  for (unsigned i = 0; i < sizeof(int64_t); ++i) {
+    p[i] = (unsigned char)*vp;
+    *vp >>= 8;
+  }
+}
+
 void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
                                    AggBuffer *aggBuffer) {
 
@@ -1760,30 +1797,30 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
     return;
   }
 
-  unsigned char *ptr;
+  unsigned char ptr[8];
   switch (CPV->getType()->getTypeID()) {
 
   case Type::IntegerTyID: {
     const Type *ETy = CPV->getType();
     if (ETy == Type::getInt8Ty(CPV->getContext())) {
       unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
-      ptr = &c;
+      ConvertIntToBytes<>(ptr, c);
       aggBuffer->addBytes(ptr, 1, Bytes);
     } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
       short int16 = (short)cast<ConstantInt>(CPV)->getZExtValue();
-      ptr = (unsigned char *)&int16;
+      ConvertIntToBytes<>(ptr, int16);
       aggBuffer->addBytes(ptr, 2, Bytes);
     } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
       if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
         int int32 = (int)(constInt->getZExtValue());
-        ptr = (unsigned char *)&int32;
+        ConvertIntToBytes<>(ptr, int32);
         aggBuffer->addBytes(ptr, 4, Bytes);
         break;
       } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
         if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
                 ConstantFoldConstantExpression(Cexpr, *TD))) {
           int int32 = (int)(constInt->getZExtValue());
-          ptr = (unsigned char *)&int32;
+          ConvertIntToBytes<>(ptr, int32);
           aggBuffer->addBytes(ptr, 4, Bytes);
           break;
         }
@@ -1798,14 +1835,14 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
     } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
       if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
         long long int64 = (long long)(constInt->getZExtValue());
-        ptr = (unsigned char *)&int64;
+        ConvertIntToBytes<>(ptr, int64);
         aggBuffer->addBytes(ptr, 8, Bytes);
         break;
       } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
         if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
                 ConstantFoldConstantExpression(Cexpr, *TD))) {
           long long int64 = (long long)(constInt->getZExtValue());
-          ptr = (unsigned char *)&int64;
+          ConvertIntToBytes<>(ptr, int64);
           aggBuffer->addBytes(ptr, 8, Bytes);
           break;
         }
@@ -1827,11 +1864,11 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
     const Type *Ty = CFP->getType();
     if (Ty == Type::getFloatTy(CPV->getContext())) {
       float float32 = (float) CFP->getValueAPF().convertToFloat();
-      ptr = (unsigned char *)&float32;
+      ConvertFloatToBytes(ptr, float32);
       aggBuffer->addBytes(ptr, 4, Bytes);
     } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
       double float64 = CFP->getValueAPF().convertToDouble();
-      ptr = (unsigned char *)&float64;
+      ConvertDoubleToBytes(ptr, float64);
       aggBuffer->addBytes(ptr, 8, Bytes);
     } else {
       llvm_unreachable("unsupported fp const type");
@@ -1993,16 +2030,16 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
   MCContext &Ctx = OutContext;
 
   if (CV->isNullValue() || isa<UndefValue>(CV))
-    return MCConstantExpr::Create(0, Ctx);
+    return MCConstantExpr::create(0, Ctx);
 
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
-    return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+    return MCConstantExpr::create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
     const MCSymbolRefExpr *Expr =
-      MCSymbolRefExpr::Create(getSymbol(GV), Ctx);
+      MCSymbolRefExpr::create(getSymbol(GV), Ctx);
     if (ProcessingGeneric) {
-      return NVPTXGenericMCSymbolRefExpr::Create(Expr, Ctx);
+      return NVPTXGenericMCSymbolRefExpr::create(Expr, Ctx);
     } else {
       return Expr;
     }
@@ -2059,7 +2096,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
       return Base;
 
     int64_t Offset = OffsetAI.getSExtValue();
-    return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+    return MCBinaryExpr::createAdd(Base, MCConstantExpr::create(Offset, Ctx),
                                    Ctx);
   }
 
@@ -2102,8 +2139,8 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
     // the high bits so we are sure to get a proper truncation if the input is
     // a constant expr.
     unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
-    const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
-    return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+    const MCExpr *MaskExpr = MCConstantExpr::create(~0ULL >> (64-InBits), Ctx);
+    return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
   }
 
   // The MC library also has a right-shift operator, but it isn't consistently
@@ -2113,7 +2150,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
     const MCExpr *RHS = lowerConstantForGV(CE->getOperand(1), ProcessingGeneric);
     switch (CE->getOpcode()) {
     default: llvm_unreachable("Unknown binary operator constant cast expr");
-    case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+    case Instruction::Add: return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
     }
   }
   }
@@ -2123,7 +2160,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
 void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
   switch (Expr.getKind()) {
   case MCExpr::Target:
-    return cast<MCTargetExpr>(&Expr)->PrintImpl(OS);
+    return cast<MCTargetExpr>(&Expr)->printImpl(OS, MAI);
   case MCExpr::Constant:
     OS << cast<MCConstantExpr>(Expr).getValue();
     return;
@@ -2131,7 +2168,7 @@ void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
   case MCExpr::SymbolRef: {
     const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(Expr);
     const MCSymbol &Sym = SRE.getSymbol();
-    OS << Sym;
+    Sym.print(OS, MAI);
     return;
   }
 
@@ -2256,11 +2293,11 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     break;
 
   case MachineOperand::MO_GlobalAddress:
-    O << *getSymbol(MO.getGlobal());
+    getSymbol(MO.getGlobal())->print(O, MAI);
     break;
 
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     return;
 
   default:
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 301c68609a29..f6f7685e76f9 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -165,10 +165,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
               }
               if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
                 O << "generic(";
-                O << *Name;
+                Name->print(O, AP.MAI);
                 O << ")";
               } else {
-                O << *Name;
+                Name->print(O, AP.MAI);
               }
             } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
               const MCExpr *Expr =
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
index ae63caec1320..cfff0019b8d9 100644
--- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -10,34 +10,54 @@
 // When a load/store accesses the generic address space, checks whether the
 // address is casted from a non-generic address space. If so, remove this
 // addrspacecast because accessing non-generic address spaces is typically
-// faster. Besides seeking addrspacecasts, this optimization also traces into
-// the base pointer of a GEP.
+// faster. Besides removing addrspacecasts directly used by loads/stores, this
+// optimization also recursively traces into a GEP's pointer operand and a
+// bitcast's source to find more eliminable addrspacecasts.
 //
 // For instance, the code below loads a float from an array allocated in
 // addrspace(3).
 //
-// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-// %1 = gep [10 x float]* %0, i64 0, i64 %i
-// %2 = load float* %1 ; emits ld.f32
+//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+//   %1 = gep [10 x float]* %0, i64 0, i64 %i
+//   %2 = bitcast float* %1 to i32*
+//   %3 = load i32* %2 ; emits ld.u32
 //
-// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
-// and the GEP to expose more optimization opportunities to function
+// First, function hoistAddrSpaceCastFrom reorders the addrspacecast, the GEP,
+// and the bitcast to expose more optimization opportunities to function
 // optimizeMemoryInst. The intermediate code looks like:
 //
-// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %1 = addrspacecast float addrspace(3)* %0 to float*
-// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+//   %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+//   %3 = load i32* %2 ; still emits ld.u32, but will be optimized shortly
 //
 // Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
 // generic pointers, and folds the load and the addrspacecast into a load from
 // the original address space. The final code looks like:
 //
-// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+//   %3 = load i32 addrspace(3)* %1 ; emits ld.shared.u32
 //
 // This pass may remove an addrspacecast in a different BB. Therefore, we
 // implement it as a FunctionPass.
 //
+// TODO:
+// The current implementation doesn't handle PHINodes. Eliminating
+// addrspacecasts used by PHINodes is trickier because PHINodes can introduce
+// loops in data flow. For example,
+//
+//     %generic.input = addrspacecast float addrspace(3)* %input to float*
+//   loop:
+//     %y = phi [ %generic.input, %y2 ]
+//     %y2 = getelementptr %y, 1
+//     %v = load %y2
+//     br ..., label %loop, ...
+//
+// Marking %y2 shared depends on marking %y shared, but %y also data-flow
+// depends on %y2. We probably need an iterative fix-point algorithm to handle
+// this case.
+//
 //===----------------------------------------------------------------------===//
 
 #include "NVPTX.h"
@@ -62,17 +82,31 @@ class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
 public:
   static char ID;
   NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
-
   bool runOnFunction(Function &F) override;
 
+private:
   /// Optimizes load/store instructions. Idx is the index of the pointer operand
   /// (0 for load, and 1 for store). Returns true if it changes anything.
   bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+  /// Recursively traces into a GEP's pointer operand or a bitcast's source to
+  /// find an eliminable addrspacecast, and hoists that addrspacecast to the
+  /// outermost level. For example, this function transforms
+  ///   bitcast(gep(gep(addrspacecast(X))))
+  /// to
+  ///   addrspacecast(bitcast(gep(gep(X)))).
+  ///
+  /// This reordering exposes to optimizeMemoryInstruction more
+  /// optimization opportunities on loads and stores.
+  ///
+  /// Returns true if this function successfully hoists an eliminable
+  /// addrspacecast or V is already such an addrspacecast.
   /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
-  /// indices)".  This reordering exposes to optimizeMemoryInstruction more
-  /// optimization opportunities on loads and stores. Returns true if it changes
-  /// the program.
-  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+  /// indices)".
+  bool hoistAddrSpaceCastFrom(Value *V, int Depth = 0);
+  /// Helper function for GEPs.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP, int Depth);
+  /// Helper function for bitcasts.
+  bool hoistAddrSpaceCastFromBitCast(BitCastOperator *BC, int Depth);
 };
 }
 
@@ -85,11 +119,12 @@ INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
                 "Remove unnecessary non-generic-to-generic addrspacecasts",
                 false, false)
 
-// Decides whether removing Cast is valid and beneficial. Cast can be an
-// instruction or a constant expression.
-static bool IsEliminableAddrSpaceCast(Operator *Cast) {
-  // Returns false if not even an addrspacecast.
-  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+// Decides whether V is an addrspacecast and shortcutting V in load/store is
+// valid and beneficial.
+static bool isEliminableAddrSpaceCast(Value *V) {
+  // Returns false if V is not even an addrspacecast.
+  Operator *Cast = dyn_cast<Operator>(V);
+  if (Cast == nullptr || Cast->getOpcode() != Instruction::AddrSpaceCast)
     return false;
 
   Value *Src = Cast->getOperand(0);
@@ -108,67 +143,119 @@ static bool IsEliminableAddrSpaceCast(Operator *Cast) {
           DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
 }
 
-bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
-    GEPOperator *GEP) {
-  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
-  if (!Cast)
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP,
+                                                               int Depth) {
+  if (!hoistAddrSpaceCastFrom(GEP->getPointerOperand(), Depth + 1))
     return false;
 
-  if (!IsEliminableAddrSpaceCast(Cast))
-    return false;
+  // That hoistAddrSpaceCastFrom succeeds implies GEP's pointer operand is now
+  // an eliminable addrspacecast.
+  assert(isEliminableAddrSpaceCast(GEP->getPointerOperand()));
+  Operator *Cast = cast<Operator>(GEP->getPointerOperand());
 
   SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
   if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
-    // %1 = gep (addrspacecast X), indices
+    // GEP = gep (addrspacecast X), indices
     // =>
-    // %0 = gep X, indices
-    // %1 = addrspacecast %0
-    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(
+    // NewGEP = gep X, indices
+    // NewASC = addrspacecast NewGEP
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
         GEP->getSourceElementType(), Cast->getOperand(0), Indices,
-        GEP->getName(), GEPI);
-    NewGEPI->setIsInBounds(GEP->isInBounds());
-    GEP->replaceAllUsesWith(
-        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+        "", GEPI);
+    NewGEP->setIsInBounds(GEP->isInBounds());
+    Value *NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI);
+    NewASC->takeName(GEP);
+    GEP->replaceAllUsesWith(NewASC);
   } else {
     // GEP is a constant expression.
-    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+    Constant *NewGEP = ConstantExpr::getGetElementPtr(
         GEP->getSourceElementType(), cast<Constant>(Cast->getOperand(0)),
         Indices, GEP->isInBounds());
     GEP->replaceAllUsesWith(
-        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+        ConstantExpr::getAddrSpaceCast(NewGEP, GEP->getType()));
   }
 
   return true;
 }
 
-bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
-                                                               unsigned Idx) {
-  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
-  // GEP to expose more optimization opportunites.
-  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
-    hoistAddrSpaceCastFromGEP(GEP);
-  }
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast(
+    BitCastOperator *BC, int Depth) {
+  if (!hoistAddrSpaceCastFrom(BC->getOperand(0), Depth + 1))
+    return false;
 
-  // load/store (addrspacecast X) => load/store X if shortcutting the
-  // addrspacecast is valid and can improve performance.
-  //
-  // e.g.,
-  // %1 = addrspacecast float addrspace(3)* %0 to float*
-  // %2 = load float* %1
-  // ->
-  // %2 = load float addrspace(3)* %0
-  //
-  // Note: the addrspacecast can also be a constant expression.
-  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
-    if (IsEliminableAddrSpaceCast(Cast)) {
-      MI->setOperand(Idx, Cast->getOperand(0));
-      return true;
-    }
+  // That hoistAddrSpaceCastFrom succeeds implies BC's source operand is now
+  // an eliminable addrspacecast.
+  assert(isEliminableAddrSpaceCast(BC->getOperand(0)));
+  Operator *Cast = cast<Operator>(BC->getOperand(0));
+
+  // Cast  = addrspacecast Src
+  // BC    = bitcast Cast
+  //   =>
+  // Cast' = bitcast Src
+  // BC'   = addrspacecast Cast'
+  Value *Src = Cast->getOperand(0);
+  Type *TypeOfNewCast =
+      PointerType::get(BC->getType()->getPointerElementType(),
+                       Src->getType()->getPointerAddressSpace());
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(BC)) {
+    Value *NewCast = new BitCastInst(Src, TypeOfNewCast, "", BCI);
+    Value *NewBC = new AddrSpaceCastInst(NewCast, BC->getType(), "", BCI);
+    NewBC->takeName(BC);
+    BC->replaceAllUsesWith(NewBC);
+  } else {
+    // BC is a constant expression.
+    Constant *NewCast =
+        ConstantExpr::getBitCast(cast<Constant>(Src), TypeOfNewCast);
+    Constant *NewBC = ConstantExpr::getAddrSpaceCast(NewCast, BC->getType());
+    BC->replaceAllUsesWith(NewBC);
   }
+  return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V,
+                                                            int Depth) {
+  // Returns true if V is already an eliminable addrspacecast.
+  if (isEliminableAddrSpaceCast(V))
+    return true;
+
+  // Limit the depth to prevent this recursive function from running too long.
+  const int MaxDepth = 20;
+  if (Depth >= MaxDepth)
+    return false;
+
+  // If V is a GEP or bitcast, hoist the addrspacecast if any from its pointer
+  // operand. This enables optimizeMemoryInstruction to shortcut addrspacecasts
+  // that are not directly used by the load/store.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+    return hoistAddrSpaceCastFromGEP(GEP, Depth);
+
+  if (BitCastOperator *BC = dyn_cast<BitCastOperator>(V))
+    return hoistAddrSpaceCastFromBitCast(BC, Depth);
 
   return false;
 }
 
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  if (hoistAddrSpaceCastFrom(MI->getOperand(Idx))) {
+    // load/store (addrspacecast X) => load/store X if shortcutting the
+    // addrspacecast is valid and can improve performance.
+    //
+    // e.g.,
+    // %1 = addrspacecast float addrspace(3)* %0 to float*
+    // %2 = load float* %1
+    // ->
+    // %2 = load float addrspace(3)* %0
+    //
+    // Note: the addrspacecast can also be a constant expression.
+    assert(isEliminableAddrSpaceCast(MI->getOperand(Idx)));
+    Operator *ASC = cast<Operator>(MI->getOperand(Idx));
+    MI->setOperand(Idx, ASC->getOperand(0));
+    return true;
+  }
+  return false;
+}
+
 bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
   if (DisableFavorNonGeneric)
     return false;
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index fa38a686fcbf..232a611d1760 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -613,6 +613,10 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
       Opc =
           TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
       break;
+    case ADDRESS_SPACE_PARAM:
+      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
+                         : NVPTX::nvvm_ptr_gen_to_param;
+      break;
     }
     return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
   }
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 805847a581fa..b5af72ab855a 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3725,7 +3725,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
 /// (CodeGenPrepare.cpp)
 bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                Type *Ty) const {
+                                                Type *Ty,
+                                                unsigned AS) const {
 
   // AddrMode - This represents an addressing mode of:
   //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 5142ae3cd88f..ed94775b3002 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -456,7 +456,8 @@ public:
   /// Used to guide target specific optimizations, like loop strength
   /// reduction (LoopStrengthReduce.cpp) and memory optimization for
   /// address mode (CodeGenPrepare.cpp)
-  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
 
   /// getFunctionAlignment - Return the Log2 alignment of this function.
   unsigned getFunctionAlignment(const Function *F) const;
@@ -497,12 +498,6 @@ public:
                                     std::vector<SDValue> &Ops,
                                     SelectionDAG &DAG) const override;
 
-  unsigned getInlineAsmMemConstraint(
-      const std::string &ConstraintCode) const override {
-    // FIXME: Map different constraints differently.
-    return InlineAsm::Constraint_m;
-  }
-
   const NVPTXTargetMachine *nvTM;
 
   // PTX always uses 32-bit shift amounts
diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
new file mode 100644
index 000000000000..24dcb122b94e
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -0,0 +1,170 @@
+//===-- NVPTXLowerKernelArgs.cpp - Lower kernel arguments -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pointer arguments to kernel functions need to be lowered specially.
+//
+// 1. Copy byval struct args to local memory. This is a preparation for handling
+//    cases like
+//
+//    kernel void foo(struct A arg, ...)
+//    {
+//      struct A *p = &arg;
+//      ...
+//      ... = p->field1 ...  (there is no generic address for .param)
+//      p->field2 = ...      (there is no write access to .param)
+//    }
+//
+// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+//    global address space. This allows later optimizations to emit
+//    ld.global.*/st.global.* for accessing these pointer arguments. For
+//    example,
+//
+//    define void @foo(float* %input) {
+//      %v = load float, float* %input, align 4
+//      ...
+//    }
+//
+//    becomes
+//
+//    define void @foo(float* %input) {
+//      %input2 = addrspacecast float* %input to float addrspace(1)*
+//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
+//      %v = load float, float* %input3, align 4
+//      ...
+//    }
+//
+//    Later, NVPTXFavorNonGenericAddrSpaces will optimize it to
+//
+//    define void @foo(float* %input) {
+//      %input2 = addrspacecast float* %input to float addrspace(1)*
+//      %v = load float, float addrspace(1)* %input2, align 4
+//      ...
+//    }
+//
+// TODO: merge this pass with NVPTXFavorNonGenericAddrSpaces so that other
+// passes don't cancel the addrspacecast pair this pass emits.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXLowerKernelArgs : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+
+  // handle byval parameters
+  void handleByValParam(Argument *);
+  // handle non-byval pointer parameters
+  void handlePointerParam(Argument *);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXLowerKernelArgs(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+  const char *getPassName() const override {
+    return "Lower pointer arguments of CUDA kernels";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+} // namespace
+
+char NVPTXLowerKernelArgs::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
+                "Lower kernel arguments (NVPTX)", false, false)
+
+// =============================================================================
+// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d),
+// then add the following instructions to the first basic block:
+//
+// %temp = alloca %struct.x, align 8
+// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
+// %tv = load %struct.x addrspace(101)* %tempd
+// store %struct.x %tv, %struct.x* %temp, align 8
+//
+// The above code allocates some space in the stack and copies the incoming
+// struct from param space to local space.
+// Then replace all occurrences of %d by %temp.
+// =============================================================================
+void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) {
+  Function *Func = Arg->getParent();
+  Instruction *FirstInst = &(Func->getEntryBlock().front());
+  PointerType *PType = dyn_cast<PointerType>(Arg->getType());
+
+  assert(PType && "Expecting pointer type in handleByValParam");
+
+  Type *StructType = PType->getElementType();
+  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+  // Set the alignment to alignment of the byval parameter. This is because,
+  // later load/stores assume that alignment, and we are going to replace
+  // the use of the byval parameter with this alloca instruction.
+  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
+  Arg->replaceAllUsesWith(AllocA);
+
+  Value *ArgInParam = new AddrSpaceCastInst(
+      Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+      FirstInst);
+  LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
+  new StoreInst(LI, AllocA, FirstInst);
+}
+
+void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) {
+  assert(!Arg->hasByValAttr() &&
+         "byval params should be handled by handleByValParam");
+  if (Arg->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)
+    return; // Already in a specific space; a same-space cast is invalid IR.
+  Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin();
+  Instruction *ArgInGlobal = new AddrSpaceCastInst(
+      Arg, PointerType::get(Arg->getType()->getPointerElementType(),
+                            ADDRESS_SPACE_GLOBAL),
+      Arg->getName(), FirstInst);
+  Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(),
+                                              Arg->getName(), FirstInst);
+  // Replace with ArgInGeneric all uses of Arg except the one in ArgInGlobal.
+  Arg->replaceAllUsesWith(ArgInGeneric);
+  ArgInGlobal->setOperand(0, Arg);
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  // Skip non-kernels. See the comments at the top of this file.
+  if (!isKernelFunction(F))
+    return false;
+
+  for (Argument &Arg : F.args()) {
+    if (Arg.getType()->isPointerTy()) {
+      if (Arg.hasByValAttr())
+        handleByValParam(&Arg);
+      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
+        handlePointerParam(&Arg);
+    }
+  }
+  return true;
+}
+
+FunctionPass *
+llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXLowerKernelArgs(TM);
+}
diff --git a/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp b/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
deleted file mode 100644
index 68dfbb716139..000000000000
--- a/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===-- NVPTXLowerStructArgs.cpp - Copy struct args to local memory =====--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Copy struct args to local memory. This is needed for kernel functions only.
-// This is a preparation for handling cases like
-//
-// kernel void foo(struct A arg, ...)
-// {
-//     struct A *p = &arg;
-//     ...
-//     ... = p->filed1 ...  (this is no generic address for .param)
-//     p->filed2 = ...      (this is no write access to .param)
-// }
-//
-//===----------------------------------------------------------------------===//
-
-#include "NVPTX.h"
-#include "NVPTXUtilities.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace llvm {
-void initializeNVPTXLowerStructArgsPass(PassRegistry &);
-}
-
-namespace {
-class NVPTXLowerStructArgs : public FunctionPass {
-  bool runOnFunction(Function &F) override;
-
-  void handleStructPtrArgs(Function &);
-  void handleParam(Argument *);
-
-public:
-  static char ID; // Pass identification, replacement for typeid
-  NVPTXLowerStructArgs() : FunctionPass(ID) {}
-  const char *getPassName() const override {
-    return "Copy structure (byval *) arguments to stack";
-  }
-};
-} // namespace
-
-char NVPTXLowerStructArgs::ID = 1;
-
-INITIALIZE_PASS(NVPTXLowerStructArgs, "nvptx-lower-struct-args",
-                "Lower structure arguments (NVPTX)", false, false)
-
-void NVPTXLowerStructArgs::handleParam(Argument *Arg) {
-  Function *Func = Arg->getParent();
-  Instruction *FirstInst = &(Func->getEntryBlock().front());
-  PointerType *PType = dyn_cast<PointerType>(Arg->getType());
-
-  assert(PType && "Expecting pointer type in handleParam");
-
-  Type *StructType = PType->getElementType();
-  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
-
-  /* Set the alignment to alignment of the byval parameter. This is because,
-   * later load/stores assume that alignment, and we are going to replace
-   * the use of the byval parameter with this alloca instruction.
-   */
-  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
-
-  Arg->replaceAllUsesWith(AllocA);
-
-  // Get the cvt.gen.to.param intrinsic
-  Type *CvtTypes[] = {
-      Type::getInt8PtrTy(Func->getParent()->getContext(), ADDRESS_SPACE_PARAM),
-      Type::getInt8PtrTy(Func->getParent()->getContext(),
-                         ADDRESS_SPACE_GENERIC)};
-  Function *CvtFunc = Intrinsic::getDeclaration(
-      Func->getParent(), Intrinsic::nvvm_ptr_gen_to_param, CvtTypes);
-
-  Value *BitcastArgs[] = {
-      new BitCastInst(Arg, Type::getInt8PtrTy(Func->getParent()->getContext(),
-                                              ADDRESS_SPACE_GENERIC),
-                      Arg->getName(), FirstInst)};
-  CallInst *CallCVT =
-      CallInst::Create(CvtFunc, BitcastArgs, "cvt_to_param", FirstInst);
-
-  BitCastInst *BitCast = new BitCastInst(
-      CallCVT, PointerType::get(StructType, ADDRESS_SPACE_PARAM),
-      Arg->getName(), FirstInst);
-  LoadInst *LI = new LoadInst(BitCast, Arg->getName(), FirstInst);
-  new StoreInst(LI, AllocA, FirstInst);
-}
-
-// =============================================================================
-// If the function had a struct ptr arg, say foo(%struct.x *byval %d), then
-// add the following instructions to the first basic block :
-//
-// %temp = alloca %struct.x, align 8
-// %tt1 = bitcast %struct.x * %d to i8 *
-// %tt2 = llvm.nvvm.cvt.gen.to.param %tt2
-// %tempd = bitcast i8 addrspace(101) * to %struct.x addrspace(101) *
-// %tv = load %struct.x addrspace(101) * %tempd
-// store %struct.x %tv, %struct.x * %temp, align 8
-//
-// The above code allocates some space in the stack and copies the incoming
-// struct from param space to local space.
-// Then replace all occurences of %d by %temp.
-// =============================================================================
-void NVPTXLowerStructArgs::handleStructPtrArgs(Function &F) {
-  for (Argument &Arg : F.args()) {
-    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
-      handleParam(&Arg);
-    }
-  }
-}
-
-// =============================================================================
-// Main function for this pass.
-// =============================================================================
-bool NVPTXLowerStructArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
-  handleStructPtrArgs(F);
-  return true;
-}
-
-FunctionPass *llvm::createNVPTXLowerStructArgsPass() {
-  return new NVPTXLowerStructArgs();
-}
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index 779b65ecc39f..3c98b9febf85 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -16,11 +16,11 @@ using namespace llvm;
 #define DEBUG_TYPE "nvptx-mcexpr"
 
 const NVPTXFloatMCExpr*
-NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+NVPTXFloatMCExpr::create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
   return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
 }
 
-void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
+void NVPTXFloatMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   bool Ignored;
   unsigned NumHex;
   APFloat APF = getAPFloat();
@@ -47,11 +47,14 @@ void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
 }
 
 const NVPTXGenericMCSymbolRefExpr*
-NVPTXGenericMCSymbolRefExpr::Create(const MCSymbolRefExpr *SymExpr,
+NVPTXGenericMCSymbolRefExpr::create(const MCSymbolRefExpr *SymExpr,
                                     MCContext &Ctx) {
   return new (Ctx) NVPTXGenericMCSymbolRefExpr(SymExpr);
 }
 
-void NVPTXGenericMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const {
-  OS << "generic(" << *SymExpr << ")";
+void NVPTXGenericMCSymbolRefExpr::printImpl(raw_ostream &OS,
+                                            const MCAsmInfo *MAI) const {
+  OS << "generic(";
+  SymExpr->print(OS, MAI);
+  OS << ")";
 }
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 8c6b219abd13..46b4b33e7e40 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -36,17 +36,17 @@ public:
   /// @name Construction
   /// @{
 
-  static const NVPTXFloatMCExpr *Create(VariantKind Kind, APFloat Flt,
+  static const NVPTXFloatMCExpr *create(VariantKind Kind, APFloat Flt,
                                         MCContext &Ctx);
 
-  static const NVPTXFloatMCExpr *CreateConstantFPSingle(APFloat Flt,
+  static const NVPTXFloatMCExpr *createConstantFPSingle(APFloat Flt,
                                                         MCContext &Ctx) {
-    return Create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+    return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
   }
 
-  static const NVPTXFloatMCExpr *CreateConstantFPDouble(APFloat Flt,
+  static const NVPTXFloatMCExpr *createConstantFPDouble(APFloat Flt,
                                                         MCContext &Ctx) {
-    return Create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+    return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
   }
 
   /// @}
@@ -61,14 +61,14 @@ public:
 
 /// @}
 
-  void PrintImpl(raw_ostream &OS) const override;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override {
     return false;
   }
   void visitUsedExpr(MCStreamer &Streamer) const override {};
-  MCSection *FindAssociatedSection() const override { return nullptr; }
+  MCSection *findAssociatedSection() const override { return nullptr; }
 
   // There are no TLS NVPTXMCExprs at the moment.
   void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
@@ -92,7 +92,7 @@ public:
   /// @{
 
   static const NVPTXGenericMCSymbolRefExpr
-  *Create(const MCSymbolRefExpr *SymExpr, MCContext &Ctx);
+  *create(const MCSymbolRefExpr *SymExpr, MCContext &Ctx);
 
   /// @}
   /// @name Accessors
@@ -103,14 +103,14 @@ public:
 
   /// @}
 
-  void PrintImpl(raw_ostream &OS) const override;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override {
     return false;
   }
   void visitUsedExpr(MCStreamer &Streamer) const override {};
-  MCSection *FindAssociatedSection() const override { return nullptr; }
+  MCSection *findAssociatedSection() const override { return nullptr; }
 
   // There are no TLS NVPTXMCExprs at the moment.
   void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index ac27c30aabab..a6466687bc7b 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -53,7 +53,7 @@ void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
-void initializeNVPTXLowerStructArgsPass(PassRegistry &);
+void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -69,7 +69,7 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
   initializeNVPTXFavorNonGenericAddrSpacesPass(
     *PassRegistry::getPassRegistry());
-  initializeNVPTXLowerStructArgsPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
 }
 
 static std::string computeDataLayout(bool is64Bit) {
@@ -163,7 +163,13 @@ void NVPTXPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
+  addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
   addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
+  // be eliminated by SROA. We do not run SROA right after NVPTXLowerKernelArgs
+  // because we plan to merge NVPTXLowerKernelArgs and
+  // NVPTXFavorNonGenericAddrSpaces into one pass.
+  addPass(createSROAPass());
   // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
   // them unused. We could remove dead code in an ad-hoc manner, but that
   // requires manual work and might be error-prone.
@@ -181,6 +187,9 @@ void NVPTXPassConfig::addIRPasses() {
     addPass(createEarlyCSEPass());
   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
   addPass(createNaryReassociatePass());
+  // NaryReassociate on GEPs creates redundant common expressions, so run
+  // EarlyCSE after it.
+  addPass(createEarlyCSEPass());
 }
 
 bool NVPTXPassConfig::addInstSelector() {
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 83de4d996993..1736d03961f7 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/Support/SourceMgr.h"
@@ -772,7 +773,7 @@ public:
 
     if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
       int64_t Res;
-      if (TE->EvaluateAsConstant(Res))
+      if (TE->evaluateAsConstant(Res))
         return CreateContextImm(Res, S, E, IsPPC64);
     }
 
@@ -814,13 +815,13 @@ addNegOperand(MCInst &Inst, MCOperand &Op, MCContext &Ctx) {
     }
   } else if (const MCBinaryExpr *BinExpr = dyn_cast<MCBinaryExpr>(Expr)) {
     if (BinExpr->getOpcode() == MCBinaryExpr::Sub) {
-      const MCExpr *NE = MCBinaryExpr::CreateSub(BinExpr->getRHS(),
+      const MCExpr *NE = MCBinaryExpr::createSub(BinExpr->getRHS(),
                                                  BinExpr->getLHS(), Ctx);
       Inst.addOperand(MCOperand::createExpr(NE));
       return;
     }
   }
-  Inst.addOperand(MCOperand::createExpr(MCUnaryExpr::CreateMinus(Expr, Ctx)));
+  Inst.addOperand(MCOperand::createExpr(MCUnaryExpr::createMinus(Expr, Ctx)));
 }
 
 void PPCAsmParser::ProcessInstruction(MCInst &Inst,
@@ -1330,7 +1331,7 @@ ExtractModifierFromExpr(const MCExpr *E,
       return nullptr;
     }
 
-    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Context);
+    return MCSymbolRefExpr::create(&SRE->getSymbol(), Context);
   }
 
   case MCExpr::Unary: {
@@ -1338,7 +1339,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
     if (!Sub)
       return nullptr;
-    return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
+    return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
   }
 
   case MCExpr::Binary: {
@@ -1362,7 +1363,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     else
       return nullptr;
 
-    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
+    return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
   }
   }
 
@@ -1396,7 +1397,7 @@ FixupVariantKind(const MCExpr *E) {
     default:
       return E;
     }
-    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, Context);
+    return MCSymbolRefExpr::create(&SRE->getSymbol(), Variant, Context);
   }
 
   case MCExpr::Unary: {
@@ -1404,7 +1405,7 @@ FixupVariantKind(const MCExpr *E) {
     const MCExpr *Sub = FixupVariantKind(UE->getSubExpr());
     if (Sub == UE->getSubExpr())
       return E;
-    return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
+    return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
   }
 
   case MCExpr::Binary: {
@@ -1413,7 +1414,7 @@ FixupVariantKind(const MCExpr *E) {
     const MCExpr *RHS = FixupVariantKind(BE->getRHS());
     if (LHS == BE->getLHS() && RHS == BE->getRHS())
       return E;
-    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
+    return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
   }
   }
 
@@ -1438,7 +1439,7 @@ ParseExpression(const MCExpr *&EVal) {
   PPCMCExpr::VariantKind Variant;
   const MCExpr *E = ExtractModifierFromExpr(EVal, Variant);
   if (E)
-    EVal = PPCMCExpr::Create(Variant, E, false, getParser().getContext());
+    EVal = PPCMCExpr::create(Variant, E, false, getParser().getContext());
 
   return false;
 }
@@ -1485,7 +1486,7 @@ ParseDarwinExpression(const MCExpr *&EVal) {
     if (getLexer().isNot(AsmToken::RParen))
       return Error(Parser.getTok().getLoc(), "expected ')'");
     Parser.Lex(); // Eat the ')'
-    EVal = PPCMCExpr::Create(Variant, EVal, false, getParser().getContext());
+    EVal = PPCMCExpr::create(Variant, EVal, false, getParser().getContext());
   }
   return false;
 }
@@ -1863,7 +1864,7 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
     Error(L, "expected identifier in directive");
     return false;
   }
-  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+  MCSymbolELF *Sym = cast<MCSymbolELF>(getContext().getOrCreateSymbol(Name));
 
   if (getLexer().isNot(AsmToken::Comma)) {
     Error(L, "unexpected token in directive");
@@ -1936,19 +1937,19 @@ PPCAsmParser::applyModifierToExpr(const MCExpr *E,
                                   MCContext &Ctx) {
   switch (Variant) {
   case MCSymbolRefExpr::VK_PPC_LO:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
   case MCSymbolRefExpr::VK_PPC_HI:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
   case MCSymbolRefExpr::VK_PPC_HA:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
   case MCSymbolRefExpr::VK_PPC_HIGHER:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
   case MCSymbolRefExpr::VK_PPC_HIGHERA:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
   case MCSymbolRefExpr::VK_PPC_HIGHEST:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
   case MCSymbolRefExpr::VK_PPC_HIGHESTA:
-    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+    return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
   default:
     return nullptr;
   }
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 1a130e87bf3e..5e1d22789056 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -445,6 +445,6 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
   
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  O << *Op.getExpr();
+  Op.getExpr()->print(O, &MAI);
 }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 86885e111dd1..72742dc3ee20 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -11,12 +11,12 @@
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -142,13 +142,14 @@ public:
       // to resolve the fixup directly.  Emit a relocation and leave
       // resolution of the final target address to the linker.
       if (const MCSymbolRefExpr *A = Target.getSymA()) {
-        const MCSymbolData &Data = Asm.getSymbolData(A->getSymbol());
-        // The "other" values are stored in the last 6 bits of the second byte.
-        // The traditional defines for STO values assume the full byte and thus
-        // the shift to pack it.
-        unsigned Other = MCELF::getOther(Data) << 2;
-        if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
-          IsResolved = false;
+        if (const auto *S = dyn_cast<MCSymbolELF>(&A->getSymbol())) {
+          // PPCMCTargetDesc.cpp reads and writes the local-entry bits via
+          // getOther()/setOther() with no shift, so the STO_* mask must be
+          // applied to the unshifted value here too (was "<< 2").
+          unsigned Other = S->getOther();
+          if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
+            IsResolved = false;
+        }
       }
       break;
     }
@@ -176,7 +177,7 @@ public:
   bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
     uint64_t NumNops = Count / 4;
     for (uint64_t i = 0; i != NumNops; ++i)
-      OW->Write32(0x60000000);
+      OW->write32(0x60000000);
 
     OW->WriteZeros(Count % 4);
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 3e3489fc46aa..992be5b966c1 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -11,9 +11,9 @@
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -28,7 +28,7 @@ namespace {
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
 
-    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+    bool needsRelocateWithSymbol(const MCSymbol &Sym,
                                  unsigned Type) const override;
   };
 }
@@ -395,7 +395,7 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
   return Type;
 }
 
-bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
                                                  unsigned Type) const {
   switch (Type) {
     default:
@@ -407,7 +407,7 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
-      // The "other" values are stored in the last 6 bits of the second byte.
-      // The traditional defines for STO values assume the full byte and thus
-      // the shift to pack it.
-      unsigned Other = MCELF::getOther(SD) << 2;
+      // PPCMCTargetDesc.cpp reads and writes the local-entry bits via
+      // getOther()/setOther() with no shift, so the STO_* mask must be applied
+      // to the unshifted value here too (MCELF::getOther() needed the << 2).
+      unsigned Other = cast<MCSymbolELF>(Sym).getOther();
       return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0;
   }
 }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 17f4cd421641..95379246f301 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -22,6 +22,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOpcodes.h"
@@ -116,38 +117,19 @@ public:
     switch (Size) {
     case 4:
       if (IsLittleEndian) {
-        OS << (char)(Bits);
-        OS << (char)(Bits >> 8);
-        OS << (char)(Bits >> 16);
-        OS << (char)(Bits >> 24);
+        support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
       } else {
-        OS << (char)(Bits >> 24);
-        OS << (char)(Bits >> 16);
-        OS << (char)(Bits >> 8);
-        OS << (char)(Bits);
+        support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
       }
       break;
     case 8:
       // If we emit a pair of instructions, the first one is
       // always in the top 32 bits, even on little-endian.
       if (IsLittleEndian) {
-        OS << (char)(Bits >> 32);
-        OS << (char)(Bits >> 40);
-        OS << (char)(Bits >> 48);
-        OS << (char)(Bits >> 56);
-        OS << (char)(Bits);
-        OS << (char)(Bits >> 8);
-        OS << (char)(Bits >> 16);
-        OS << (char)(Bits >> 24);
+        uint64_t Swapped = (Bits << 32) | (Bits >> 32);
+        support::endian::Writer<support::little>(OS).write<uint64_t>(Swapped);
       } else {
-        OS << (char)(Bits >> 56);
-        OS << (char)(Bits >> 48);
-        OS << (char)(Bits >> 40);
-        OS << (char)(Bits >> 32);
-        OS << (char)(Bits >> 24);
-        OS << (char)(Bits >> 16);
-        OS << (char)(Bits >> 8);
-        OS << (char)(Bits);
+        support::endian::Writer<support::big>(OS).write<uint64_t>(Bits);
       }
       break;
     default:
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 7204befe15ee..6b97d4c1456b 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -19,12 +19,12 @@ using namespace llvm;
 #define DEBUG_TYPE "ppcmcexpr"
 
 const PPCMCExpr*
-PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr,
                   bool isDarwin, MCContext &Ctx) {
   return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin);
 }
 
-void PPCMCExpr::PrintImpl(raw_ostream &OS) const {
+void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   if (isDarwinSyntax()) {
     switch (Kind) {
     default: llvm_unreachable("Invalid kind!");
@@ -34,10 +34,10 @@ void PPCMCExpr::PrintImpl(raw_ostream &OS) const {
     }
 
     OS << '(';
-    getSubExpr()->print(OS);
+    getSubExpr()->print(OS, MAI);
     OS << ')';
   } else {
-    getSubExpr()->print(OS);
+    getSubExpr()->print(OS, MAI);
 
     switch (Kind) {
     default: llvm_unreachable("Invalid kind!");
@@ -53,21 +53,21 @@ void PPCMCExpr::PrintImpl(raw_ostream &OS) const {
 }
 
 bool
-PPCMCExpr::EvaluateAsConstant(int64_t &Res) const {
+PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
   MCValue Value;
 
-  if (!getSubExpr()->EvaluateAsRelocatable(Value, nullptr, nullptr))
+  if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr, nullptr))
     return false;
 
   if (!Value.isAbsolute())
     return false;
 
-  Res = EvaluateAsInt64(Value.getConstant());
+  Res = evaluateAsInt64(Value.getConstant());
   return true;
 }
 
 int64_t
-PPCMCExpr::EvaluateAsInt64(int64_t Value) const {
+PPCMCExpr::evaluateAsInt64(int64_t Value) const {
   switch (Kind) {
     case VK_PPC_LO:
       return Value & 0xffff;
@@ -90,16 +90,16 @@ PPCMCExpr::EvaluateAsInt64(int64_t Value) const {
 }
 
 bool
-PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
                                      const MCAsmLayout *Layout,
                                      const MCFixup *Fixup) const {
   MCValue Value;
 
-  if (!getSubExpr()->EvaluateAsRelocatable(Value, Layout, Fixup))
+  if (!getSubExpr()->evaluateAsRelocatable(Value, Layout, Fixup))
     return false;
 
   if (Value.isAbsolute()) {
-    int64_t Result = EvaluateAsInt64(Value.getConstant());
+    int64_t Result = evaluateAsInt64(Value.getConstant());
     if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) &&
         (Result >= 0x8000))
       return false;
@@ -138,7 +138,7 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
         Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA;
         break;
     }
-    Sym = MCSymbolRefExpr::Create(&Sym->getSymbol(), Modifier, Context);
+    Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context);
     Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
   }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index ca72ccf0f76e..a641780516b3 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -34,7 +34,7 @@ private:
   const MCExpr *Expr;
   bool IsDarwin;
 
-  int64_t EvaluateAsInt64(int64_t Value) const;
+  int64_t evaluateAsInt64(int64_t Value) const;
 
   explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr, bool IsDarwin)
       : Kind(Kind), Expr(Expr), IsDarwin(IsDarwin) {}
@@ -43,22 +43,22 @@ public:
   /// @name Construction
   /// @{
 
-  static const PPCMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+  static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr,
                                  bool isDarwin, MCContext &Ctx);
 
-  static const PPCMCExpr *CreateLo(const MCExpr *Expr,
+  static const PPCMCExpr *createLo(const MCExpr *Expr,
                                    bool isDarwin, MCContext &Ctx) {
-    return Create(VK_PPC_LO, Expr, isDarwin, Ctx);
+    return create(VK_PPC_LO, Expr, isDarwin, Ctx);
   }
 
-  static const PPCMCExpr *CreateHi(const MCExpr *Expr,
+  static const PPCMCExpr *createHi(const MCExpr *Expr,
                                    bool isDarwin, MCContext &Ctx) {
-    return Create(VK_PPC_HI, Expr, isDarwin, Ctx);
+    return create(VK_PPC_HI, Expr, isDarwin, Ctx);
   }
 
-  static const PPCMCExpr *CreateHa(const MCExpr *Expr,
+  static const PPCMCExpr *createHa(const MCExpr *Expr,
                                    bool isDarwin, MCContext &Ctx) {
-    return Create(VK_PPC_HA, Expr, isDarwin, Ctx);
+    return create(VK_PPC_HA, Expr, isDarwin, Ctx);
   }
 
   /// @}
@@ -77,19 +77,19 @@ public:
 
   /// @}
 
-  void PrintImpl(raw_ostream &OS) const override;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
-  MCSection *FindAssociatedSection() const override {
-    return getSubExpr()->FindAssociatedSection();
+  MCSection *findAssociatedSection() const override {
+    return getSubExpr()->findAssociatedSection();
   }
 
   // There are no TLS PPCMCExprs at the moment.
   void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
-  bool EvaluateAsConstant(int64_t &Res) const;
+  bool evaluateAsConstant(int64_t &Res) const;
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 847437611a5f..1e8e8046669d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -16,14 +16,14 @@
 #include "PPCMCAsmInfo.h"
 #include "PPCTargetStreamer.h"
 #include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -70,8 +70,8 @@ static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
-  Triple TheTriple(TT);
+static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
   bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
                   TheTriple.getArch() == Triple::ppc64le);
 
@@ -132,8 +132,14 @@ public:
   void emitAbiVersion(int AbiVersion) override {
     OS << "\t.abiversion " << AbiVersion << '\n';
   }
-  void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) override {
-    OS << "\t.localentry\t" << *S << ", " << *LocalOffset << '\n';
+  void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
+    const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+
+    OS << "\t.localentry\t";
+    S->print(OS, MAI);
+    OS << ", ";
+    LocalOffset->print(OS, MAI);
+    OS << '\n';
   }
 };
 
@@ -159,25 +165,21 @@ public:
     Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
     MCA.setELFHeaderEFlags(Flags);
   }
-  void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) override {
+  void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
     MCAssembler &MCA = getStreamer().getAssembler();
-    MCSymbolData &Data = getStreamer().getOrCreateSymbolData(S);
 
     int64_t Res;
-    if (!LocalOffset->EvaluateAsAbsolute(Res, MCA))
+    if (!LocalOffset->evaluateAsAbsolute(Res, MCA))
       report_fatal_error(".localentry expression must be absolute.");
 
     unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res);
     if (Res != ELF::decodePPC64LocalEntryOffset(Encoded))
       report_fatal_error(".localentry expression cannot be encoded.");
 
-    // The "other" values are stored in the last 6 bits of the second byte.
-    // The traditional defines for STO values assume the full byte and thus
-    // the shift to pack it.
-    unsigned Other = MCELF::getOther(Data) << 2;
+    unsigned Other = S->getOther();
     Other &= ~ELF::STO_PPC64_LOCAL_MASK;
     Other |= Encoded;
-    MCELF::setOther(Data, Other >> 2);
+    S->setOther(Other);
 
     // For GAS compatibility, unless we already saw a .abiversion directive,
     // set e_flags to indicate ELFv2 ABI.
@@ -185,22 +187,18 @@ public:
     if ((Flags & ELF::EF_PPC64_ABI) == 0)
       MCA.setELFHeaderEFlags(Flags | 2);
   }
-  void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override {
+  void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
+    auto *Symbol = cast<MCSymbolELF>(S);
     // When encoding an assignment to set symbol A to symbol B, also copy
     // the st_other bits encoding the local entry point offset.
     if (Value->getKind() != MCExpr::SymbolRef)
       return;
-    const MCSymbol &RhsSym =
-        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
-    MCSymbolData &Data = getStreamer().getOrCreateSymbolData(&RhsSym);
-    MCSymbolData &SymbolData = getStreamer().getOrCreateSymbolData(Symbol);
-    // The "other" values are stored in the last 6 bits of the second byte.
-    // The traditional defines for STO values assume the full byte and thus
-    // the shift to pack it.
-    unsigned Other = MCELF::getOther(SymbolData) << 2;
+    const auto &RhsSym = cast<MCSymbolELF>(
+        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
+    unsigned Other = Symbol->getOther();
     Other &= ~ELF::STO_PPC64_LOCAL_MASK;
-    Other |= (MCELF::getOther(Data) << 2) & ELF::STO_PPC64_LOCAL_MASK;
-    MCELF::setOther(SymbolData, Other >> 2);
+    Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK;
+    Symbol->setOther(Other);
   }
 };
 
@@ -217,7 +215,7 @@ public:
   void emitAbiVersion(int AbiVersion) override {
     llvm_unreachable("Unknown pseudo-op: .abiversion");
   }
-  void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) override {
+  void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
     llvm_unreachable("Unknown pseudo-op: .localentry");
   }
 };
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 3c906d2a51e3..9d7289658f0f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
 
 namespace {
 class PPCMachObjectWriter : public MCMachObjectTargetWriter {
-  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+  bool recordScatteredRelocation(MachObjectWriter *Writer,
                                  const MCAssembler &Asm,
                                  const MCAsmLayout &Layout,
                                  const MCFragment *Fragment,
@@ -38,10 +38,9 @@ class PPCMachObjectWriter : public MCMachObjectTargetWriter {
 
 public:
   PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
-      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
-                                 /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
 
-  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override {
@@ -187,9 +186,9 @@ static uint32_t getFixupOffset(const MCAsmLayout &Layout,
 
 /// \return false if falling back to using non-scattered relocation,
 /// otherwise true for normal scattered relocation.
-/// based on X86MachObjectWriter::RecordScatteredRelocation
-/// and ARMMachObjectWriter::RecordScatteredRelocation
-bool PPCMachObjectWriter::RecordScatteredRelocation(
+/// based on X86MachObjectWriter::recordScatteredRelocation
+/// and ARMMachObjectWriter::recordScatteredRelocation
+bool PPCMachObjectWriter::recordScatteredRelocation(
     MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
     const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
     unsigned Log2Size, uint64_t &FixedValue) {
@@ -206,28 +205,26 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
-  if (!A_SD->getFragment())
+  if (!A->getFragment())
     report_fatal_error("symbol '" + A->getName() +
                        "' can not be undefined in a subtraction expression");
 
   uint32_t Value = Writer->getSymbolAddress(*A, Layout);
-  uint64_t SecAddr =
-      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
   FixedValue += SecAddr;
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbol *SB = &B->getSymbol();
 
-    if (!B_SD->getFragment())
+    if (!SB->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
                          "' can not be undefined in a subtraction expression");
 
     // FIXME: is Type correct? see include/llvm/Support/MachO.h
     Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
   }
   // FIXME: does FixedValue get used??
 
@@ -253,7 +250,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
     }
 
     // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
-    // see PPCMCExpr::EvaluateAsRelocatableImpl()
+    // see PPCMCExpr::evaluateAsRelocatableImpl()
     uint32_t other_half = 0;
     switch (Type) {
     case MachO::PPC_RELOC_LO16_SECTDIFF:
@@ -317,7 +314,7 @@ void PPCMachObjectWriter::RecordPPCRelocation(
       // Q: are branch targets ever scattered?
       RelocType != MachO::PPC_RELOC_BR24 &&
       RelocType != MachO::PPC_RELOC_BR14) {
-    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+    recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
                               Log2Size, FixedValue);
     return;
   }
@@ -346,7 +343,7 @@ void PPCMachObjectWriter::RecordPPCRelocation(
     // Resolve constant variables.
     if (A->isVariable()) {
       int64_t Res;
-      if (A->getVariableValue()->EvaluateAsAbsolute(
+      if (A->getVariableValue()->evaluateAsAbsolute(
               Res, Layout, Writer->getSectionAddressMap())) {
         FixedValue = Res;
         return;
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 4f1c3c73e710..b42b0f9ef478 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -49,7 +49,7 @@
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
@@ -181,14 +181,14 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
     return;
 
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     return;
   case MachineOperand::MO_ConstantPoolIndex:
     O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
       << '_' << MO.getIndex();
     return;
   case MachineOperand::MO_BlockAddress:
-    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
     return;
   case MachineOperand::MO_GlobalAddress: {
     // Computing the address of a global symbol, not calling it.
@@ -222,8 +222,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
     } else {
       SymToPrint = getSymbol(GV);
     }
-    
-    O << *SymToPrint;
+
+    SymToPrint->print(O, MAI);
 
     printOffset(MO.getOffset(), O);
     return;
@@ -422,11 +422,11 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
       TM.getRelocationModel() == Reloc::PIC_)
     Kind = MCSymbolRefExpr::VK_PLT;
   const MCSymbolRefExpr *TlsRef =
-    MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext);
+    MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext);
   const MachineOperand &MO = MI->getOperand(2);
   const GlobalValue *GValue = MO.getGlobal();
   MCSymbol *MOSymbol = getSymbol(GValue);
-  const MCExpr *SymVar = MCSymbolRefExpr::Create(MOSymbol, VK, OutContext);
+  const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext);
   EmitToStreamer(*OutStreamer,
                  MCInstBuilder(Subtarget->isPPC64() ?
                                PPC::BL8_NOP_TLS : PPC::BL_TLS)
@@ -464,10 +464,10 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MCSymbol *GOTSymbol =
       OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *OffsExpr =
-      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol,
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol,
                                                       MCSymbolRefExpr::VK_PPC_LOCAL,
                                                       OutContext),
-                              MCConstantExpr::Create(4, OutContext),
+                              MCConstantExpr::create(4, OutContext),
                               OutContext);
 
     // Emit the 'bl'.
@@ -486,7 +486,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL)
       // FIXME: We would like an efficient form for this, so we don't have to do
       // a lot of extra uniquing.
-      .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+      .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
     
     // Emit the label.
     OutStreamer->EmitLabel(PICBase);
@@ -502,9 +502,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
     TmpInst.setOpcode(PPC::LWZ);
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+      MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
     const MCExpr *PB =
-      MCSymbolRefExpr::Create(MF->getPICBaseSymbol(),
+      MCSymbolRefExpr::create(MF->getPICBaseSymbol(),
                               MCSymbolRefExpr::VK_None,
                               OutContext);
     const MCOperand TR = TmpInst.getOperand(1);
@@ -512,7 +512,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri)
     TmpInst.getOperand(1) =
-        MCOperand::createExpr(MCBinaryExpr::CreateSub(Exp, PB, OutContext));
+        MCOperand::createExpr(MCBinaryExpr::createSub(Exp, PB, OutContext));
     TmpInst.getOperand(0) = TR;
     TmpInst.getOperand(2) = PICR;
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -547,19 +547,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     if (PL == PICLevel::Small) {
       const MCExpr *Exp =
-        MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_GOT,
+        MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_GOT,
                                 OutContext);
       TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     } else {
       MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
 
       const MCExpr *Exp =
-        MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_None,
+        MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_None,
                                 OutContext);
       const MCExpr *PB =
-        MCSymbolRefExpr::Create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
+        MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
                                                              OutContext);
-      Exp = MCBinaryExpr::CreateSub(Exp, PB, OutContext);
+      Exp = MCBinaryExpr::createSub(Exp, PB, OutContext);
       TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     }
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -592,7 +592,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+      MCSymbolRefExpr::create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -639,7 +639,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
                               OutContext);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -681,7 +681,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -715,7 +715,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
                               OutContext);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -729,7 +729,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTprel =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
                               OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
@@ -748,7 +748,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -763,10 +763,10 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL)
       // FIXME: We would like an efficient form for this, so we don't have to do
       // a lot of extra uniquing.
-      .addExpr(MCSymbolRefExpr::Create(NextInstr, OutContext)));
+      .addExpr(MCSymbolRefExpr::create(NextInstr, OutContext)));
     const MCExpr *OffsExpr =
-      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol, OutContext),
-                                MCSymbolRefExpr::Create(GOTRef, OutContext),
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, OutContext),
+                                MCSymbolRefExpr::create(GOTRef, OutContext),
         OutContext);
     OutStreamer->EmitLabel(GOTRef);
     OutStreamer->EmitValue(OffsExpr, 4);
@@ -786,10 +786,10 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case PPC::PPC32GOT: {
     MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *SymGotTlsL =
-      MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO,
+      MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO,
                               OutContext);
     const MCExpr *SymGotTlsHA =                               
-      MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA,
+      MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA,
                               OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
                                  .addReg(MI->getOperand(0).getReg())
@@ -808,7 +808,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
                               OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
@@ -825,7 +825,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsGD = MCSymbolRefExpr::Create(
+    const MCExpr *SymGotTlsGD = MCSymbolRefExpr::create(
         MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO
                                        : MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
         OutContext);
@@ -853,7 +853,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
                               OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
@@ -870,7 +870,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsLD = MCSymbolRefExpr::Create(
+    const MCExpr *SymGotTlsLD = MCSymbolRefExpr::create(
         MOSymbol, Subtarget->isPPC64() ? MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO
                                        : MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
         OutContext);
@@ -900,7 +900,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
                               OutContext);
     EmitToStreamer(
         *OutStreamer,
@@ -920,7 +920,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
+      MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
                               OutContext);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(Subtarget->isPPC64() ? PPC::ADDI8 : PPC::ADDI)
@@ -1012,8 +1012,8 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
   // The GOT pointer points to the middle of the GOT, in order to reference the
   // entire 64kB range.  0x8000 is the midpoint.
   const MCExpr *tocExpr =
-    MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(CurrentPos, OutContext),
-                            MCConstantExpr::Create(0x8000, OutContext),
+    MCBinaryExpr::createAdd(MCSymbolRefExpr::create(CurrentPos, OutContext),
+                            MCConstantExpr::create(0x8000, OutContext),
                             OutContext);
 
   OutStreamer->EmitAssignment(TOCSym, tocExpr);
@@ -1036,10 +1036,10 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
       OutStreamer->EmitLabel(RelocSymbol);
 
       const MCExpr *OffsExpr =
-        MCBinaryExpr::CreateSub(
-          MCSymbolRefExpr::Create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
+        MCBinaryExpr::createSub(
+          MCSymbolRefExpr::create(OutContext.getOrCreateSymbol(Twine(".LTOC")),
                                                                OutContext),
-                                  MCSymbolRefExpr::Create(PICBase, OutContext),
+                                  MCSymbolRefExpr::create(PICBase, OutContext),
           OutContext);
       OutStreamer->EmitValue(OffsExpr, 4);
       OutStreamer->EmitLabel(CurrentFnSym);
@@ -1062,12 +1062,12 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   MCSymbol *Symbol1 = CurrentFnSymForSize;
   // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
   // entry point.
-  OutStreamer->EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
+  OutStreamer->EmitValue(MCSymbolRefExpr::create(Symbol1, OutContext),
                          8 /*size*/);
   MCSymbol *Symbol2 = OutContext.getOrCreateSymbol(StringRef(".TOC."));
   // Generates a R_PPC64_TOC relocation for TOC base insertion.
   OutStreamer->EmitValue(
-    MCSymbolRefExpr::Create(Symbol2, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
+    MCSymbolRefExpr::create(Symbol2, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
     8/*size*/);
   // Emit a null environment pointer.
   OutStreamer->EmitIntValue(0, 8 /* size */);
@@ -1133,22 +1133,22 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
     MCSymbol *GlobalEntryLabel = OutContext.createTempSymbol();
     OutStreamer->EmitLabel(GlobalEntryLabel);
     const MCSymbolRefExpr *GlobalEntryLabelExp =
-      MCSymbolRefExpr::Create(GlobalEntryLabel, OutContext);
+      MCSymbolRefExpr::create(GlobalEntryLabel, OutContext);
 
     MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC."));
     const MCExpr *TOCDeltaExpr =
-      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(TOCSymbol, OutContext),
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
                               GlobalEntryLabelExp, OutContext);
 
     const MCExpr *TOCDeltaHi =
-      PPCMCExpr::CreateHa(TOCDeltaExpr, false, OutContext);
+      PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
                                  .addReg(PPC::X2)
                                  .addReg(PPC::X12)
                                  .addExpr(TOCDeltaHi));
 
     const MCExpr *TOCDeltaLo =
-      PPCMCExpr::CreateLo(TOCDeltaExpr, false, OutContext);
+      PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
                                  .addReg(PPC::X2)
                                  .addReg(PPC::X2)
@@ -1157,16 +1157,16 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
     MCSymbol *LocalEntryLabel = OutContext.createTempSymbol();
     OutStreamer->EmitLabel(LocalEntryLabel);
     const MCSymbolRefExpr *LocalEntryLabelExp =
-       MCSymbolRefExpr::Create(LocalEntryLabel, OutContext);
+       MCSymbolRefExpr::create(LocalEntryLabel, OutContext);
     const MCExpr *LocalOffsetExp =
-      MCBinaryExpr::CreateSub(LocalEntryLabelExp,
+      MCBinaryExpr::createSub(LocalEntryLabelExp,
                               GlobalEntryLabelExp, OutContext);
 
     PPCTargetStreamer *TS =
       static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
 
     if (TS)
-      TS->emitLocalEntry(CurrentFnSym, LocalOffsetExp);
+      TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp);
   }
 }
 
@@ -1305,10 +1305,10 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
       OutStreamer->EmitLabel(Stub);
       OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
 
-      const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext);
-      const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
+      const MCExpr *Anon = MCSymbolRefExpr::create(AnonSymbol, OutContext);
+      const MCExpr *LazyPtrExpr = MCSymbolRefExpr::create(LazyPtr, OutContext);
       const MCExpr *Sub =
-        MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext);
+        MCBinaryExpr::createSub(LazyPtrExpr, Anon, OutContext);
 
       // mflr r0
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
@@ -1318,7 +1318,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
       // mflr r11
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
       // addis r11, r11, ha16(LazyPtr - AnonSymbol)
-      const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, true, OutContext);
+      const MCExpr *SubHa16 = PPCMCExpr::createHa(Sub, true, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
         .addReg(PPC::R11)
         .addReg(PPC::R11)
@@ -1328,7 +1328,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
 
       // ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
       // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
-      const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, true, OutContext);
+      const MCExpr *SubLo16 = PPCMCExpr::createLo(Sub, true, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
         .addReg(PPC::R12)
         .addExpr(SubLo16).addExpr(SubLo16)
@@ -1364,7 +1364,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
     MCSymbol *Stub = Stubs[i].first;
     MCSymbol *RawSym = Stubs[i].second.getPointer();
     MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
-    const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
+    const MCExpr *LazyPtrExpr = MCSymbolRefExpr::create(LazyPtr, OutContext);
 
     OutStreamer->SwitchSection(StubSection);
     EmitAlignment(4);
@@ -1373,7 +1373,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
 
     // lis r11, ha16(LazyPtr)
     const MCExpr *LazyPtrHa16 =
-      PPCMCExpr::CreateHa(LazyPtrExpr, true, OutContext);
+      PPCMCExpr::createHa(LazyPtrExpr, true, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LIS)
       .addReg(PPC::R11)
       .addExpr(LazyPtrHa16));
@@ -1381,7 +1381,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
     // ldu r12, lo16(LazyPtr)(r11)
     // lwzu r12, lo16(LazyPtr)(r11)
     const MCExpr *LazyPtrLo16 =
-      PPCMCExpr::CreateLo(LazyPtrExpr, true, OutContext);
+      PPCMCExpr::createLo(LazyPtrExpr, true, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
       .addReg(PPC::R12)
       .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
@@ -1465,7 +1465,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
         // need to be indirect and pc-rel. We accomplish this by using NLPs.
         // However, sometimes the types are local to the file. So we need to
         // fill in the value for the NLP in those cases.
-        OutStreamer->EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+        OutStreamer->EmitValue(MCSymbolRefExpr::create(MCSym.getPointer(),
                                                        OutContext),
                               isPPC64 ? 8 : 4/*size*/);
     }
@@ -1484,7 +1484,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
       OutStreamer->EmitLabel(Stubs[i].first);
       //   .long _foo
       OutStreamer->EmitValue(MCSymbolRefExpr::
-                             Create(Stubs[i].second.getPointer(),
+                             create(Stubs[i].second.getPointer(),
                                     OutContext),
                              isPPC64 ? 8 : 4/*size*/);
     }
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 0b8e23c4ebf8..a561d5b1190a 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -148,6 +148,9 @@ class PPCFastISel final : public FastISel {
     bool isVSFRCRegister(unsigned Register) const {
       return MRI.getRegClass(Register)->getID() == PPC::VSFRCRegClassID;
     }
+    bool isVSSRCRegister(unsigned Register) const {
+      return MRI.getRegClass(Register)->getID() == PPC::VSSRCRegClassID;
+    }
     bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
                     bool isZExt, unsigned DestReg);
     bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
@@ -503,8 +506,11 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
 
   // If this is a potential VSX load with an offset of 0, a VSX indexed load can
   // be used.
+  bool IsVSSRC = (ResultReg != 0) && isVSSRCRegister(ResultReg);
   bool IsVSFRC = (ResultReg != 0) && isVSFRCRegister(ResultReg);
-  if (IsVSFRC && (Opc == PPC::LFD) && 
+  bool Is32VSXLoad = IsVSSRC && Opc == PPC::LFS;
+  bool Is64VSXLoad = IsVSFRC && Opc == PPC::LFD; // f64: VSFRC, as for STFD
+  if ((Is32VSXLoad || Is64VSXLoad) &&
       (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
       (Addr.Offset == 0)) {
     UseOffset = false;
@@ -518,7 +524,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
   // into a RegBase.
   if (Addr.BaseType == Address::FrameIndexBase) {
     // VSX only provides an indexed load.
-    if (IsVSFRC && Opc == PPC::LFD) return false;
+    if (Is32VSXLoad || Is64VSXLoad) return false;
 
     MachineMemOperand *MMO =
       FuncInfo.MF->getMachineMemOperand(
@@ -532,7 +538,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
   // Base reg with offset in range.
   } else if (UseOffset) {
     // VSX only provides an indexed load.
-    if (IsVSFRC && Opc == PPC::LFD) return false;
+    if (Is32VSXLoad || Is64VSXLoad) return false;
 
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addImm(Addr.Offset).addReg(Addr.Base.Reg);
@@ -555,7 +561,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
       case PPC::LWA:    Opc = PPC::LWAX;    break;
       case PPC::LWA_32: Opc = PPC::LWAX_32; break;
       case PPC::LD:     Opc = PPC::LDX;     break;
-      case PPC::LFS:    Opc = PPC::LFSX;    break;
+      case PPC::LFS:    Opc = IsVSSRC ? PPC::LXSSPX : PPC::LFSX; break;
       case PPC::LFD:    Opc = IsVSFRC ? PPC::LXSDX : PPC::LFDX; break;
     }
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
@@ -636,9 +642,12 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
 
   // If this is a potential VSX store with an offset of 0, a VSX indexed store
   // can be used.
+  bool IsVSSRC = isVSSRCRegister(SrcReg);
   bool IsVSFRC = isVSFRCRegister(SrcReg);
-  if (IsVSFRC && (Opc == PPC::STFD) && 
-      (Addr.BaseType != Address::FrameIndexBase) && UseOffset && 
+  bool Is32VSXStore = IsVSSRC && Opc == PPC::STFS;
+  bool Is64VSXStore = IsVSFRC && Opc == PPC::STFD;
+  if ((Is32VSXStore || Is64VSXStore) &&
+      (Addr.BaseType != Address::FrameIndexBase) && UseOffset &&
       (Addr.Offset == 0)) {
     UseOffset = false;
   }
@@ -648,7 +657,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
   // into a RegBase.
   if (Addr.BaseType == Address::FrameIndexBase) {
     // VSX only provides an indexed store.
-    if (IsVSFRC && Opc == PPC::STFD) return false;
+    if (Is32VSXStore || Is64VSXStore) return false;
 
     MachineMemOperand *MMO =
       FuncInfo.MF->getMachineMemOperand(
@@ -665,7 +674,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
   // Base reg with offset in range.
   } else if (UseOffset) {
     // VSX only provides an indexed store.
-    if (IsVSFRC && Opc == PPC::STFD) return false;
+    if (Is32VSXStore || Is64VSXStore) return false;
     
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
       .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
@@ -684,7 +693,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
       case PPC::STH8: Opc = PPC::STHX8; break;
       case PPC::STW8: Opc = PPC::STWX8; break;
       case PPC::STD:  Opc = PPC::STDX;  break;
-      case PPC::STFS: Opc = PPC::STFSX; break;
+      case PPC::STFS: Opc = IsVSSRC ? PPC::STXSSPX : PPC::STFSX; break;
       case PPC::STFD: Opc = IsVSFRC ? PPC::STXSDX : PPC::STFDX; break;
     }
 
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index bb9315e9520e..2600ee5db179 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10825,7 +10825,8 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 // isLegalAddressingMode - Return true if the addressing mode represented
 // by AM is legal for this target, for a load/store of the specified type.
 bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              Type *Ty) const {
+                                              Type *Ty,
+                                              unsigned AS) const {
   // PPC does not allow r+i addressing modes for vectors!
   if (Ty->isVectorTy() && AM.BaseOffs != 0)
     return false;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index c93de430fd05..7fd3f9c3de3d 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -556,7 +556,8 @@ namespace llvm {
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                               unsigned AS) const override;
 
     /// isLegalICmpImmediate - Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 9685bac2aebb..d08b80871f3e 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1078,6 +1078,82 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
                         (outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
                         "xssubsp $XT, $XA, $XB", IIC_VecFP,
                         [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
+
+  // FMA Instructions
+  let BaseName = "XSMADDASP" in {
+  let isCommutable = 1 in
+  def XSMADDASP : XX3Form<60, 1,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMADDMSP : XX3Form<60, 9,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSMSUBASP" in {
+  let isCommutable = 1 in
+  def XSMSUBASP : XX3Form<60, 17,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fma f32:$XA, f32:$XB,
+                                              (fneg f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSMSUBMSP : XX3Form<60, 25,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMADDASP" in {
+  let isCommutable = 1 in
+  def XSNMADDASP : XX3Form<60, 129,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+                                                    f32:$XTi)))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMADDMSP : XX3Form<60, 137,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
+
+  let BaseName = "XSNMSUBASP" in {
+  let isCommutable = 1 in
+  def XSNMSUBASP : XX3Form<60, 145,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
+                          [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+                                                    (fneg f32:$XTi))))]>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  let IsVSXFMAAlt = 1 in
+  def XSNMSUBMSP : XX3Form<60, 153,
+                          (outs vssrc:$XT),
+                          (ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
+                          "xsnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>,
+                          RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
+                          AltVSXFMARel;
+  }
 } // AddedComplexity = 400
 } // HasP8Vector
 
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index f1e28651aea2..05cb6e11db67 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -142,28 +142,28 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin)
     RefKind = MCSymbolRefExpr::VK_PLT;
 
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
 
   if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(), Ctx),
                                    Ctx);
 
   // Subtract off the PIC base if required.
   if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) {
     const MachineFunction *MF = MO.getParent()->getParent()->getParent();
     
-    const MCExpr *PB = MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
-    Expr = MCBinaryExpr::CreateSub(Expr, PB, Ctx);
+    const MCExpr *PB = MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+    Expr = MCBinaryExpr::createSub(Expr, PB, Ctx);
   }
 
   // Add ha16() / lo16() markers if required.
   switch (access) {
     case PPCII::MO_LO:
-      Expr = PPCMCExpr::CreateLo(Expr, isDarwin, Ctx);
+      Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx);
       break;
     case PPCII::MO_HA:
-      Expr = PPCMCExpr::CreateHa(Expr, isDarwin, Ctx);
+      Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx);
       break;
   }
 
@@ -193,7 +193,7 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
       MCOp = MCOperand::createImm(MO.getImm());
       break;
     case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(
+      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
                                       MO.getMBB()->getSymbol(), AP.OutContext));
       break;
     case MachineOperand::MO_GlobalAddress:
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index 9ad134070082..9ee5db938b67 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -55,9 +55,9 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
 const MCExpr *PPC64LinuxTargetObjectFile::
 getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   const MCExpr *Expr =
-    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext());
-  return MCBinaryExpr::CreateAdd(Expr,
-                                 MCConstantExpr::Create(0x8000, getContext()),
+    MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext());
+  return MCBinaryExpr::createAdd(Expr,
+                                 MCConstantExpr::create(0x8000, getContext()),
                                  getContext());
 }
 
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
index 8aaf5e188907..dbe7617d3542 100644
--- a/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -20,7 +20,7 @@ public:
   virtual void emitTCEntry(const MCSymbol &S) = 0;
   virtual void emitMachine(StringRef CPU) = 0;
   virtual void emitAbiVersion(int AbiVersion) = 0;
-  virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) = 0;
+  virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
 };
 }
 
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 9b3606372035..0a05d25189b0 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -105,7 +105,7 @@ namespace ShaderType {
 /// a separate piece of memory that is unique from other
 /// memory locations.
 namespace AMDGPUAS {
-enum AddressSpaces {
+enum AddressSpaces : unsigned {
   PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
   GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
   CONSTANT_ADDRESS = 2, ///< Address space for constant memory
@@ -137,7 +137,10 @@ enum AddressSpaces {
   CONSTANT_BUFFER_14 = 22,
   CONSTANT_BUFFER_15 = 23,
   ADDRESS_NONE = 24, ///< Address space for unknown memory.
-  LAST_ADDRESS = ADDRESS_NONE
+  LAST_ADDRESS = ADDRESS_NONE,
+
+  // Some places use this if the address space can't be determined.
+  UNKNOWN_ADDRESS_SPACE = ~0u
 };
 
 } // namespace AMDGPUAS
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index d00ae78c99b0..d56838ec2019 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -257,9 +257,22 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
   for (MVT VT : ScalarIntVTs) {
@@ -301,6 +314,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
+  setOperationAction(ISD::SMIN, MVT::i32, Legal);
+  setOperationAction(ISD::UMIN, MVT::i32, Legal);
+  setOperationAction(ISD::SMAX, MVT::i32, Legal);
+  setOperationAction(ISD::UMAX, MVT::i32, Legal);
+
   if (!Subtarget->hasFFBH())
     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
 
@@ -962,17 +980,17 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                    Op.getOperand(2));
 
     case AMDGPUIntrinsic::AMDGPU_imax:
-      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
-                                                  Op.getOperand(2));
+      return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1),
+                                            Op.getOperand(2));
     case AMDGPUIntrinsic::AMDGPU_umax:
-      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
-                                                  Op.getOperand(2));
+      return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1),
+                                            Op.getOperand(2));
     case AMDGPUIntrinsic::AMDGPU_imin:
-      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
-                                                  Op.getOperand(2));
+      return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1),
+                                            Op.getOperand(2));
     case AMDGPUIntrinsic::AMDGPU_umin:
-      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
-                                                  Op.getOperand(2));
+      return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1),
+                                            Op.getOperand(2));
 
     case AMDGPUIntrinsic::AMDGPU_umul24:
       return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
@@ -1050,7 +1068,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                             Op.getOperand(1));
 
-  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
+  return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1));
 }
 
 /// Linear Interpolation
@@ -1149,7 +1167,7 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
   return SDValue();
 }
 
-/// \brief Generate Min/Max node
+// FIXME: Remove this when combines are added to DAGCombiner.
 SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
                                              EVT VT,
                                              SDValue LHS,
@@ -1165,22 +1183,22 @@ SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
   switch (CCOpcode) {
   case ISD::SETULE:
   case ISD::SETULT: {
-    unsigned Opc = (LHS == True) ? AMDGPUISD::UMIN : AMDGPUISD::UMAX;
+    unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX;
     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
   case ISD::SETLE:
   case ISD::SETLT: {
-    unsigned Opc = (LHS == True) ? AMDGPUISD::SMIN : AMDGPUISD::SMAX;
+    unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX;
     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
   case ISD::SETGT:
   case ISD::SETGE: {
-    unsigned Opc = (LHS == True) ? AMDGPUISD::SMAX : AMDGPUISD::SMIN;
+    unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN;
     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
   case ISD::SETUGE:
   case ISD::SETUGT: {
-    unsigned Opc = (LHS == True) ? AMDGPUISD::UMAX : AMDGPUISD::UMIN;
+    unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN;
     return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
   default:
@@ -2644,11 +2662,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(COS_HW)
   NODE_NAME_CASE(SIN_HW)
   NODE_NAME_CASE(FMAX_LEGACY)
-  NODE_NAME_CASE(SMAX)
-  NODE_NAME_CASE(UMAX)
   NODE_NAME_CASE(FMIN_LEGACY)
-  NODE_NAME_CASE(SMIN)
-  NODE_NAME_CASE(UMIN)
   NODE_NAME_CASE(FMAX3)
   NODE_NAME_CASE(SMAX3)
   NODE_NAME_CASE(UMAX3)
@@ -2794,14 +2808,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
 
     break;
   }
-  case AMDGPUISD::SMAX:
-  case AMDGPUISD::UMAX:
-  case AMDGPUISD::SMIN:
-  case AMDGPUISD::UMIN:
-    computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
-                              KnownZero, KnownOne, DAG, Depth);
-    break;
-
   case AMDGPUISD::CARRY:
   case AMDGPUISD::BORROW: {
     KnownZero = APInt::getHighBitsSet(32, 31);
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index c9f198129efc..fbb7d3c88437 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -228,11 +228,7 @@ enum NodeType : unsigned {
   COS_HW,
   SIN_HW,
   FMAX_LEGACY,
-  SMAX,
-  UMAX,
   FMIN_LEGACY,
-  SMIN,
-  UMIN,
   FMAX3,
   SMAX3,
   UMAX3,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index f0f10ca59723..64e295f1144c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -152,17 +152,15 @@ bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const
   return true;
 }
 
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                     MachineInstr *MI,
-                                                     ArrayRef<unsigned> Ops,
-                                                     int FrameIndex) const {
+MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
 // TODO: Implement this function
   return nullptr;
 }
-MachineInstr *
-AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
-                                       ArrayRef<unsigned> Ops,
-                                       MachineInstr *LoadMI) const {
+MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
   // TODO: Implement this function
   return nullptr;
 }
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 07042b59be7b..8fd27a17638b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -87,9 +87,11 @@ public:
 protected:
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       int FrameIndex) const override;
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       MachineInstr *LoadMI) const override;
 
 public:
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 790f34cea8cd..b413897d9d23 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -94,16 +94,6 @@ def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
   []
 >;
 
-// out = min(a, b) a and b are signed ints
-def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
-  [SDNPCommutative, SDNPAssociative]
->;
-
-// out = min(a, b) a and b are unsigned ints
-def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
-  [SDNPCommutative, SDNPAssociative]
->;
-
 // FIXME: TableGen doesn't like commutative instructions with more
 // than 2 operands.
 // out = max(a, b, c) a, b and c are floats
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 9565e3fd5fa6..20831460b933 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -64,25 +64,25 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       MCOp = MCOperand::createReg(MO.getReg());
       break;
     case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(
+      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
                                    MO.getMBB()->getSymbol(), Ctx));
       break;
     case MachineOperand::MO_GlobalAddress: {
       const GlobalValue *GV = MO.getGlobal();
       MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName()));
-      MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(Sym, Ctx));
+      MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
       break;
     }
     case MachineOperand::MO_TargetIndex: {
       assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START);
       MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
       MCOp = MCOperand::createExpr(Expr);
       break;
     }
     case MachineOperand::MO_ExternalSymbol: {
       MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
-      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
       MCOp = MCOperand::createExpr(Expr);
       break;
     }
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index b262cdf57712..a5a901c739d4 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -272,7 +272,7 @@ public:
   }
 
   bool enableSubRegLiveness() const override {
-    return false;
+    return true;
   }
 };
 
diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
index 19bffd575117..95025a6e29f1 100644
--- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
@@ -1084,7 +1084,7 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
 
     case AsmToken::Identifier:
       Operands.push_back(AMDGPUOperand::CreateExpr(
-          MCSymbolRefExpr::Create(getContext().getOrCreateSymbol(
+          MCSymbolRefExpr::create(getContext().getOrCreateSymbol(
                                   Parser.getTok().getString()), getContext()), S));
       Parser.Lex();
       return MatchOperand_Success;
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 279c3eb1912f..f70676943bb3 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -337,7 +337,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
-    Exp->print(O);
+    Exp->print(O, &MAI);
   } else {
     llvm_unreachable("unknown operand type in printOperand");
   }
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 2605ca52dfde..3713223697ed 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -25,18 +25,18 @@ namespace {
 class AMDGPUMCObjectWriter : public MCObjectWriter {
 public:
   AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {}
-  void ExecutePostLayoutBinding(MCAssembler &Asm,
+  void executePostLayoutBinding(MCAssembler &Asm,
                                 const MCAsmLayout &Layout) override {
     //XXX: Implement if necessary.
   }
-  void RecordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+  void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
                         uint64_t &FixedValue) override {
     assert(!"Not implemented");
   }
 
-  void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+  void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
 
 };
 
@@ -64,7 +64,7 @@ public:
 
 } //End anonymous namespace
 
-void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
+void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm,
                                        const MCAsmLayout &Layout) {
   for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) {
     Asm.writeSectionData(&*I, Layout);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 19d89fb27caa..028a86dfc7ad 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -11,7 +11,7 @@
 #include "AMDGPUMCAsmInfo.h"
 
 using namespace llvm;
-AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() {
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
   HasSingleParameterDotFile = false;
   //===------------------------------------------------------------------===//
   MaxInstLength = 16;
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 8f75c76c4257..a5bac51e356f 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -17,7 +17,7 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 namespace llvm {
 
-class StringRef;
+class Triple;
 
 // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
 // you will need to make sure your new class sets PrivateGlobalPrefix to
@@ -26,7 +26,7 @@ class StringRef;
 // with 'L' as a local symbol.
 class AMDGPUMCAsmInfo : public MCAsmInfoELF {
 public:
-  explicit AMDGPUMCAsmInfo(StringRef &TT);
+  explicit AMDGPUMCAsmInfo(const Triple &TT);
 };
 } // namespace llvm
 #endif
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index a809564e3be0..e683498d52a5 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -148,15 +149,11 @@ void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
 }
 
 void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
-  for (unsigned i = 0; i < 4; i++) {
-    OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
-  }
+  support::endian::Writer<support::little>(OS).write(Value);
 }
 
 void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
-  for (unsigned i = 0; i < 8; i++) {
-    EmitByte((Value >> (8 * i)) & 0xff, OS);
-  }
+  support::endian::Writer<support::little>(OS).write(Value);
 }
 
 unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 7126c82c0331..7beed092b3f7 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -781,10 +781,10 @@ def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
 def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
 def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
 def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
-def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
-def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
-def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
-def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>;
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>;
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>;
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>;
 
 def SETE_INT : R600_2OP <
   0x3A, "SETE_INT",
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 52bf2aeb87de..12d08cf4c7f5 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -155,7 +155,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
   for (MVT VT : MVT::fp_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
@@ -211,6 +210,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
   setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::SMIN);
+  setTargetDAGCombine(ISD::SMAX);
+  setTargetDAGCombine(ISD::UMIN);
+  setTargetDAGCombine(ISD::UMAX);
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);
   setTargetDAGCombine(ISD::AND);
@@ -251,47 +254,83 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
   return false;
 }
 
-// FIXME: This really needs an address space argument. The immediate offset
-// size is different for different sets of memory instruction sets.
-
-// The single offset DS instructions have a 16-bit unsigned byte offset.
-//
-// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
-// r + i with addr64. 32-bit has more addressing mode options. Depending on the
-// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
-//
-// SMRD instructions have an 8-bit, dword offset.
-//
 bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                             Type *Ty) const {
+                                             Type *Ty, unsigned AS) const {
   // No global is ever allowed as a base.
   if (AM.BaseGV)
     return false;
 
-  // Allow a 16-bit unsigned immediate field, since this is what DS instructions
-  // use.
-  if (!isUInt<16>(AM.BaseOffs))
-    return false;
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
+    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+    // additionally can do r + r + i with addr64. 32-bit has more addressing
+    // mode options. Depending on the resource constant, it can also do
+    // (i64 r0) + (i32 r1) * (i14 i).
+    //
+    // SMRD instructions have an 8-bit, dword offset.
+    //
+    // Assume nonuniform access, since the address space isn't enough to know
+    // what instruction we will use, and since we don't know if this is a load
+    // or store and scalar stores are only available on VI.
+    //
+    // We also know if we are doing an extload, we can't do a scalar load.
+    //
+    // Private arrays end up using a scratch buffer most of the time, so also
+    // assume those use MUBUF instructions. Scratch loads / stores are currently
+    // implemented as mubuf instructions with offen bit set, so slightly
+    // different than the normal addr64.
+    if (!isUInt<12>(AM.BaseOffs))
+      return false;
 
-  // Only support r+r,
-  switch (AM.Scale) {
-  case 0:  // "r+i" or just "i", depending on HasBaseReg.
-    break;
-  case 1:
-    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+    // FIXME: Since we can split immediate into soffset and immediate offset,
+    // would it make sense to allow any immediate?
+
+    switch (AM.Scale) {
+    case 0: // r + i or just i, depending on HasBaseReg.
+      return true;
+    case 1:
+      return true; // We have r + r or r + i.
+    case 2:
+      if (AM.HasBaseReg) {
+        // Reject 2 * r + r.
+        return false;
+      }
+
+      // Allow 2 * r as r + r
+      // Or  2 * r + i is allowed as r + r + i.
+      return true;
+    default: // Don't allow n * r
       return false;
-    // Otherwise we have r+r or r+i.
-    break;
-  case 2:
-    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+    }
+  }
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS: {
+    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
+    // field.
+    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
+    // an 8-bit dword offset but we don't know the alignment here.
+    if (!isUInt<16>(AM.BaseOffs))
       return false;
-    // Allow 2*r as r+r.
-    break;
-  default: // Don't allow n * r
+
+    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
+      return true;
+
+    if (AM.Scale == 1 && AM.HasBaseReg)
+      return true;
+
     return false;
   }
-
-  return true;
+  case AMDGPUAS::FLAT_ADDRESS: {
+    // Flat instructions do not have offsets, and only have the register
+    // address.
+    return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+  }
+  default:
+    llvm_unreachable("unhandled address space");
+  }
 }
 
 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
@@ -368,6 +407,12 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
   return TII->isInlineConstant(Imm);
 }
 
+static EVT toIntegerVT(EVT VT) {
+  if (VT.isVector())
+    return VT.changeVectorElementTypeToInteger();
+  return MVT::getIntegerVT(VT.getSizeInBits());
+}
+
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc SL, SDValue Chain,
                                          unsigned Offset, bool Signed) const {
@@ -380,20 +425,42 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
 
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
-  SDValue BasePtr =  DAG.getCopyFromReg(Chain, SL,
-                           MRI.getLiveInVirtReg(InputPtrReg), MVT::i64);
-  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr,
-                            DAG.getConstant(Offset, SL, MVT::i64));
+  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
+                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                            DAG.getConstant(Offset, SL, PtrVT));
   SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD,
+  unsigned Align = DL->getABITypeAlignment(Ty);
+
+  if (VT != MemVT && VT.isFloatingPoint()) {
+    // Do an integer load and convert.
+    // FIXME: This is mostly because load legalization after type legalization
+    // doesn't handle FP extloads.
+    assert(VT.getScalarType() == MVT::f32 &&
+           MemVT.getScalarType() == MVT::f16);
+
+    EVT IVT = toIntegerVT(VT);
+    EVT MemIVT = toIntegerVT(MemVT);
+    SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
+                               IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
+                               false, // isVolatile
+                               true, // isNonTemporal
+                               true, // isInvariant
+                               Align); // Alignment
+    return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
+  }
+
+  ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  return DAG.getLoad(ISD::UNINDEXED, ExtTy,
                      VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                      false, // isVolatile
                      true, // isNonTemporal
                      true, // isInvariant
-                     DL->getABITypeAlignment(Ty)); // Alignment
+                     Align); // Alignment
 }
 
 SDValue SITargetLowering::LowerFormalArguments(
@@ -1570,15 +1637,15 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
     return AMDGPUISD::FMAX3;
-  case AMDGPUISD::SMAX:
+  case ISD::SMAX:
     return AMDGPUISD::SMAX3;
-  case AMDGPUISD::UMAX:
+  case ISD::UMAX:
     return AMDGPUISD::UMAX3;
   case ISD::FMINNUM:
     return AMDGPUISD::FMIN3;
-  case AMDGPUISD::SMIN:
+  case ISD::SMIN:
     return AMDGPUISD::SMIN3;
-  case AMDGPUISD::UMIN:
+  case ISD::UMIN:
     return AMDGPUISD::UMIN3;
   default:
     llvm_unreachable("Not a min/max opcode");
@@ -1664,10 +1731,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM: // TODO: What about fmax_legacy?
   case ISD::FMINNUM:
-  case AMDGPUISD::SMAX:
-  case AMDGPUISD::SMIN:
-  case AMDGPUISD::UMAX:
-  case AMDGPUISD::UMIN: {
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN: {
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
         N->getValueType(0) != MVT::f64 &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index a95354c38816..a956b013bdb1 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -63,7 +63,7 @@ public:
                           EVT /*VT*/) const override;
 
   bool isLegalAddressingMode(const AddrMode &AM,
-                             Type *Ty) const override;
+                             Type *Ty, unsigned AS) const override;
 
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 839c2e9ecdd2..2f39074802b7 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -224,16 +224,16 @@ defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32",
 } // End Uses = [SCC]
 
 defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32",
-  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+  [(set i32:$dst, (smin i32:$src0, i32:$src1))]
 >;
 defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32",
-  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+  [(set i32:$dst, (umin i32:$src0, i32:$src1))]
 >;
 defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32",
-  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+  [(set i32:$dst, (smax i32:$src0, i32:$src1))]
 >;
 defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32",
-  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+  [(set i32:$dst, (umax i32:$src0, i32:$src1))]
 >;
 } // End Defs = [SCC]
 
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 388cb65c99cb..6b3b51afb4bd 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -408,7 +408,7 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
   uint64_t ImmValue = IsImm ? MCValOp.getImm() : 0;
   const MCExpr *ValExpr;
   if (IsImm)
-    ValExpr = MCConstantExpr::Create(ImmValue, getContext());
+    ValExpr = MCConstantExpr::create(ImmValue, getContext());
   else
     ValExpr = MCValOp.getExpr();
 
@@ -417,7 +417,7 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
   if (!IsImm || (ImmValue & ~0x1fff)) {
     MCInst TmpInst;
     const MCExpr *Expr =
-        SparcMCExpr::Create(SparcMCExpr::VK_Sparc_HI, ValExpr, getContext());
+        SparcMCExpr::create(SparcMCExpr::VK_Sparc_HI, ValExpr, getContext());
     TmpInst.setLoc(IDLoc);
     TmpInst.setOpcode(SP::SETHIi);
     TmpInst.addOperand(MCRegOp);
@@ -429,7 +429,7 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
   if (!IsImm || ((ImmValue & 0x1fff) != 0 || ImmValue == 0)) {
     MCInst TmpInst;
     const MCExpr *Expr =
-        SparcMCExpr::Create(SparcMCExpr::VK_Sparc_LO, ValExpr, getContext());
+        SparcMCExpr::create(SparcMCExpr::VK_Sparc_LO, ValExpr, getContext());
     TmpInst.setLoc(IDLoc);
     TmpInst.setOpcode(SP::ORri);
     TmpInst.addOperand(MCRegOp);
@@ -774,11 +774,11 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
       E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
       MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
 
-      const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+      const MCExpr *Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
                                                   getContext());
       if (isCall &&
           getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_)
-        Res = SparcMCExpr::Create(SparcMCExpr::VK_Sparc_WPLT30, Res,
+        Res = SparcMCExpr::create(SparcMCExpr::VK_Sparc_WPLT30, Res,
                                   getContext());
       Op = SparcOperand::CreateImm(Res, S, E);
     }
@@ -1010,7 +1010,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
     break;
   }
 
-  EVal = SparcMCExpr::Create(VK, subExpr, getContext());
+  EVal = SparcMCExpr::create(VK, subExpr, getContext());
   return true;
 }
 
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
index bac2617b0f3e..5d714fe4da92 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -120,7 +120,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O);
+  MO.getExpr()->print(O, &MAI);
 }
 
 void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 3792a596a6b8..9388527004f5 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -260,7 +260,7 @@ namespace {
 
       uint64_t NumNops = Count / 4;
       for (uint64_t i = 0; i != NumNops; ++i)
-        OW->Write32(0x01000000);
+        OW->write32(0x01000000);
 
       return true;
     }
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 124cb3b4b98b..280c6d7937b2 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -20,8 +20,7 @@ using namespace llvm;
 
 void SparcELFMCAsmInfo::anchor() {}
 
-SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
-  Triple TheTriple(TT);
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Triple &TheTriple) {
   bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
   IsLittleEndian = (TheTriple.getArch() == Triple::sparcel);
 
@@ -51,8 +50,8 @@ SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
-                               MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+    return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
   }
 
   return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
@@ -64,8 +63,8 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
                                        MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
-                               MCSymbolRefExpr::Create(Sym, Ctx), Ctx);
+    return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 84de55145b65..12386f14443e 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -17,12 +17,12 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-class StringRef;
+class Triple;
 
 class SparcELFMCAsmInfo : public MCAsmInfoELF {
   void anchor() override;
 public:
-  explicit SparcELFMCAsmInfo(StringRef TT);
+  explicit SparcELFMCAsmInfo(const Triple &TheTriple);
   const MCExpr*
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 34079eea7885..9171d4dc9c00 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -86,16 +86,10 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
 
   if (Ctx.getAsmInfo()->isLittleEndian()) {
     // Output the bits in little-endian byte order.
-    for (unsigned i = 0; i != 4; ++i) {
-      OS << (char)Bits;
-      Bits >>= 8;
-    }
+    support::endian::Writer<support::little>(OS).write<uint32_t>(Bits);
   } else {
     // Output the bits in big-endian byte order.
-    for (unsigned i = 0; i != 4; ++i) {
-      OS << (char)(Bits >> 24);
-      Bits <<= 8;
-    }
+    support::endian::Writer<support::big>(OS).write<uint32_t>(Bits);
   }
   unsigned tlsOpNo = 0;
   switch (MI.getOpcode()) {
@@ -137,7 +131,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
   }
 
   int64_t Res;
-  if (Expr->EvaluateAsAbsolute(Res))
+  if (Expr->evaluateAsAbsolute(Res))
     return Res;
 
   llvm_unreachable("Unhandled expression!");
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index d97e3a25c5a7..e85a8cd5e339 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -15,9 +15,8 @@
 #include "SparcMCExpr.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Object/ELF.h"
 
 
@@ -26,20 +25,17 @@ using namespace llvm;
 #define DEBUG_TYPE "sparcmcexpr"
 
 const SparcMCExpr*
-SparcMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+SparcMCExpr::create(VariantKind Kind, const MCExpr *Expr,
                       MCContext &Ctx) {
     return new (Ctx) SparcMCExpr(Kind, Expr);
 }
 
-
-
-void SparcMCExpr::PrintImpl(raw_ostream &OS) const
-{
+void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
 
   bool closeParen = printVariantKind(OS, Kind);
 
   const MCExpr *Expr = getSubExpr();
-  Expr->print(OS);
+  Expr->print(OS, MAI);
 
   if (closeParen)
     OS << ')';
@@ -160,10 +156,10 @@ Sparc::Fixups SparcMCExpr::getFixupKind(SparcMCExpr::VariantKind Kind) {
 }
 
 bool
-SparcMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+SparcMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
                                        const MCAsmLayout *Layout,
                                        const MCFixup *Fixup) const {
-  return getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup);
+  return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
 }
 
 static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
@@ -184,8 +180,7 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
 
   case MCExpr::SymbolRef: {
     const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
-    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
-    MCELF::SetType(SD, ELF::STT_TLS);
+    cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
     break;
   }
 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 116e10406a7c..d08ad86dbe04 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -69,7 +69,7 @@ public:
   /// @name Construction
   /// @{
 
-  static const SparcMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+  static const SparcMCExpr *create(VariantKind Kind, const MCExpr *Expr,
                                  MCContext &Ctx);
   /// @}
   /// @name Accessors
@@ -85,13 +85,13 @@ public:
   Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); }
 
   /// @}
-  void PrintImpl(raw_ostream &OS) const override;
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
-  MCSection *FindAssociatedSection() const override {
-    return getSubExpr()->FindAssociatedSection();
+  MCSection *findAssociatedSection() const override {
+    return getSubExpr()->findAssociatedSection();
   }
 
   void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 4d5672e29550..d34c87977168 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
 #include "SparcGenRegisterInfo.inc"
 
 static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
-                                       StringRef TT) {
+                                       const Triple &TT) {
   MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
   unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
   MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
@@ -43,7 +43,7 @@ static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
 }
 
 static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
-                                       StringRef TT) {
+                                         const Triple &TT) {
   MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
   unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
   MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 2047);
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 9903bc5799da..c5f046bfc5bb 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -78,9 +78,9 @@ namespace {
 
 static MCOperand createSparcMCOperand(SparcMCExpr::VariantKind Kind,
                                       MCSymbol *Sym, MCContext &OutContext) {
-  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Sym,
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym,
                                                          OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::Create(Kind, MCSym, OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym, OutContext);
   return MCOperand::createExpr(expr);
 
 }
@@ -94,15 +94,15 @@ static MCOperand createPCXRelExprOp(SparcMCExpr::VariantKind Kind,
                                     MCSymbol *CurLabel,
                                     MCContext &OutContext)
 {
-  const MCSymbolRefExpr *GOT = MCSymbolRefExpr::Create(GOTLabel, OutContext);
-  const MCSymbolRefExpr *Start = MCSymbolRefExpr::Create(StartLabel,
+  const MCSymbolRefExpr *GOT = MCSymbolRefExpr::create(GOTLabel, OutContext);
+  const MCSymbolRefExpr *Start = MCSymbolRefExpr::create(StartLabel,
                                                          OutContext);
-  const MCSymbolRefExpr *Cur = MCSymbolRefExpr::Create(CurLabel,
+  const MCSymbolRefExpr *Cur = MCSymbolRefExpr::create(CurLabel,
                                                        OutContext);
 
-  const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Cur, Start, OutContext);
-  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::Create(Kind,
+  const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
+  const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
+  const SparcMCExpr *expr = SparcMCExpr::create(Kind,
                                                 Add, OutContext);
   return MCOperand::createExpr(expr);
 }
@@ -199,7 +199,7 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
       EmitHiLo(*OutStreamer, GOTLabel,
                SparcMCExpr::VK_Sparc_H44, SparcMCExpr::VK_Sparc_M44,
                MCRegOP, OutContext, STI);
-      MCOperand imm = MCOperand::createExpr(MCConstantExpr::Create(12,
+      MCOperand imm = MCOperand::createExpr(MCConstantExpr::create(12,
                                                                    OutContext));
       EmitSHL(*OutStreamer, MCRegOP, imm, MCRegOP, STI);
       MCOperand lo = createSparcMCOperand(SparcMCExpr::VK_Sparc_L44,
@@ -211,7 +211,7 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
       EmitHiLo(*OutStreamer, GOTLabel,
                SparcMCExpr::VK_Sparc_HH, SparcMCExpr::VK_Sparc_HM,
                MCRegOP, OutContext, STI);
-      MCOperand imm = MCOperand::createExpr(MCConstantExpr::Create(32,
+      MCOperand imm = MCOperand::createExpr(MCConstantExpr::create(32,
                                                                    OutContext));
       EmitSHL(*OutStreamer, MCRegOP, imm, MCRegOP, STI);
       // Use register %o7 to load the lower 32 bits.
@@ -361,10 +361,10 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     O << (int)MO.getImm();
     break;
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     return;
   case MachineOperand::MO_GlobalAddress:
-    O << *getSymbol(MO.getGlobal());
+    getSymbol(MO.getGlobal())->print(O, MAI);
     break;
   case MachineOperand::MO_BlockAddress:
     O <<  GetBlockAddressSymbol(MO.getBlockAddress())->getName();
diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp
index 9388d594973c..b084d0021ba0 100644
--- a/lib/Target/Sparc/SparcMCInstLower.cpp
+++ b/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -59,9 +59,9 @@ static MCOperand LowerSymbolOperand(const MachineInstr *MI,
     break;
   }
 
-  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol,
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol,
                                                          AP.OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::Create(Kind, MCSym,
+  const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym,
                                                 AP.OutContext);
   return MCOperand::createExpr(expr);
 }
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 32b2240f87ea..412e124f9a26 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -34,8 +34,8 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
     }
 
     MCContext &Ctx = getContext();
-    return SparcMCExpr::Create(SparcMCExpr::VK_Sparc_R_DISP32,
-                               MCSymbolRefExpr::Create(SSym, Ctx), Ctx);
+    return SparcMCExpr::create(SparcMCExpr::VK_Sparc_R_DISP32,
+                               MCSymbolRefExpr::create(SSym, Ctx), Ctx);
   }
 
   return TargetLoweringObjectFileELF::getTTypeGlobalReference(
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index b721def54e12..3aa4c6bd32d6 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -865,9 +865,9 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
     }
     MCSymbol *Sym = Ctx.createTempSymbol();
     Out.EmitLabel(Sym);
-    const MCExpr *Base = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+    const MCExpr *Base = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
                                                  Ctx);
-    Expr = Value == 0 ? Base : MCBinaryExpr::CreateAdd(Base, Expr, Ctx);
+    Expr = Value == 0 ? Base : MCBinaryExpr::createAdd(Base, Expr, Ctx);
   }
 
   // Optionally match :tls_gdcall: or :tls_ldcall: followed by a TLS symbol.
@@ -904,7 +904,7 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
     }
 
     StringRef Identifier = Parser.getTok().getString();
-    Sym = MCSymbolRefExpr::Create(Ctx.getOrCreateSymbol(Identifier),
+    Sym = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(Identifier),
                                   Kind, Ctx);
     Parser.Lex();
   }
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 373ddfa7e257..059ae3f7fb09 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -37,13 +37,14 @@ void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
   }
 }
 
-void SystemZInstPrinter::printOperand(const MCOperand &MO, raw_ostream &O) {
+void SystemZInstPrinter::printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
+                                      raw_ostream &O) {
   if (MO.isReg())
     O << '%' << getRegisterName(MO.getReg());
   else if (MO.isImm())
     O << MO.getImm();
   else if (MO.isExpr())
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, MAI);
   else
     llvm_unreachable("Invalid operand");
 }
@@ -147,7 +148,7 @@ void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
     O << "0x";
     O.write_hex(MO.getImm());
   } else
-    O << *MO.getExpr();
+    MO.getExpr()->print(O, &MAI);
 }
 
 void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum,
@@ -175,7 +176,7 @@ void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum,
 
 void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
                                       raw_ostream &O) {
-  printOperand(MI->getOperand(OpNum), O);
+  printOperand(MI->getOperand(OpNum), &MAI, O);
 }
 
 void SystemZInstPrinter::printBDAddrOperand(const MCInst *MI, int OpNum,
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 847b6962e6f2..ba55e686f3ef 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -35,7 +35,8 @@ public:
                            raw_ostream &O);
 
   // Print the given operand.
-  static void printOperand(const MCOperand &MO, raw_ostream &O);
+  static void printOperand(const MCOperand &MO, const MCAsmInfo *MAI,
+                           raw_ostream &O);
 
   // Override MCInstPrinter.
   void printRegName(raw_ostream &O, unsigned RegNo) const override;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 1c3887ab5456..0e8a680d4dd4 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -105,7 +105,7 @@ void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
 bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
                                        MCObjectWriter *OW) const {
   for (uint64_t I = 0; I != Count; ++I)
-    OW->Write8(7);
+    OW->write8(7);
   return true;
 }
 
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 0161d6263e7d..b17977d41be1 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -13,7 +13,7 @@
 
 using namespace llvm;
 
-SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
+SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
   PointerSize = 8;
   CalleeSaveStackSlotSize = 8;
   IsLittleEndian = false;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 19b5b4b09724..800f89232063 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -14,11 +14,11 @@
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-class StringRef;
+class Triple;
 
 class SystemZMCAsmInfo : public MCAsmInfoELF {
 public:
-  explicit SystemZMCAsmInfo(StringRef TT);
+  explicit SystemZMCAsmInfo(const Triple &TT);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index c9290c1922d3..fd52a2ebf2fd 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -217,7 +217,7 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
   const MCOperand &MO = MI.getOperand(OpNum);
   const MCExpr *Expr;
   if (MO.isImm())
-    Expr = MCConstantExpr::Create(MO.getImm() + Offset, Ctx);
+    Expr = MCConstantExpr::create(MO.getImm() + Offset, Ctx);
   else {
     Expr = MO.getExpr();
     if (Offset) {
@@ -225,8 +225,8 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
       // is relative to the operand field itself, which is Offset bytes
       // into MI.  Add Offset to the relocation value to cancel out
       // this difference.
-      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
-      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+      const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx);
     }
   }
   Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind));
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 8c2075afe505..92681cf6e44b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -132,7 +132,7 @@ unsigned SystemZMC::getFirstReg(unsigned Reg) {
 }
 
 static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
-                                         StringRef TT) {
+                                         const Triple &TT) {
   MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
   MCCFIInstruction Inst =
       MCCFIInstruction::createDefCfa(nullptr,
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index a0d079fcc359..3dca7bd89f05 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -68,14 +68,14 @@ static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
 
 static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
   StringRef Name = "__tls_get_offset";
-  return MCSymbolRefExpr::Create(Context.getOrCreateSymbol(Name),
+  return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
                                  MCSymbolRefExpr::VK_PLT,
                                  Context);
 }
 
 static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
   StringRef Name = "_GLOBAL_OFFSET_TABLE_";
-  return MCSymbolRefExpr::Create(Context.getOrCreateSymbol(Name),
+  return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
                                  MCSymbolRefExpr::VK_None,
                                  Context);
 }
@@ -285,7 +285,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
   auto *ZCPV = static_cast<SystemZConstantPoolValue*>(MCPV);
 
   const MCExpr *Expr =
-    MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
+    MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()),
                             getModifierVariantKind(ZCPV->getModifier()),
                             OutContext);
   uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
@@ -305,7 +305,7 @@ bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
   } else {
     SystemZMCInstLower Lower(MF->getContext(), *this);
     MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
-    SystemZInstPrinter::printOperand(MO, OS);
+    SystemZInstPrinter::printOperand(MO, MAI, OS);
   }
   return false;
 }
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 24b5a41d7f67..91e12c2d9d7e 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -506,9 +506,10 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     *Fast = true;
   return true;
 }
-  
+
 bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                  Type *Ty) const {
+                                                  Type *Ty,
+                                                  unsigned AS) const {
   // Punt on globals for now, although they can be used in limited
   // RELATIVE LONG cases.
   if (AM.BaseGV)
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index b001abc693d6..2f7617bbdac3 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -369,7 +369,8 @@ public:
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
   bool isLegalAddImmediate(int64_t Imm) const override;
-  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *Fast) const override;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 90598852b5ed..4346850e0ac5 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -752,10 +752,9 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   return nullptr;
 }
 
-MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                      MachineInstr *MI,
-                                                      ArrayRef<unsigned> Ops,
-                                                      int FrameIndex) const {
+MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
   unsigned Opcode = MI->getOpcode();
@@ -765,9 +764,11 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         isInt<8>(MI->getOperand(2).getImm()) &&
         !MI->getOperand(3).getReg()) {
       // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
-      return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::AGSI))
-        .addFrameIndex(FrameIndex).addImm(0)
-        .addImm(MI->getOperand(2).getImm());
+      return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+                     get(SystemZ::AGSI))
+          .addFrameIndex(FrameIndex)
+          .addImm(0)
+          .addImm(MI->getOperand(2).getImm());
     }
     return nullptr;
   }
@@ -786,9 +787,11 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       isInt<8>(MI->getOperand(2).getImm())) {
     // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
     Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
-    return BuildMI(MF, MI->getDebugLoc(), get(Opcode))
-      .addFrameIndex(FrameIndex).addImm(0)
-      .addImm(MI->getOperand(2).getImm());
+    return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+                   get(Opcode))
+        .addFrameIndex(FrameIndex)
+        .addImm(0)
+        .addImm(MI->getOperand(2).getImm());
   }
 
   if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
@@ -798,17 +801,23 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     // source register instead.
     if (OpNum == 0) {
       unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
-      return BuildMI(MF, MI->getDebugLoc(), get(StoreOpcode))
-        .addOperand(MI->getOperand(1)).addFrameIndex(FrameIndex)
-        .addImm(0).addReg(0);
+      return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+                     get(StoreOpcode))
+          .addOperand(MI->getOperand(1))
+          .addFrameIndex(FrameIndex)
+          .addImm(0)
+          .addReg(0);
     }
     // If we're spilling the source of an LDGR or LGDR, load the
     // destination register instead.
     if (OpNum == 1) {
       unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
       unsigned Dest = MI->getOperand(0).getReg();
-      return BuildMI(MF, MI->getDebugLoc(), get(LoadOpcode), Dest)
-        .addFrameIndex(FrameIndex).addImm(0).addReg(0);
+      return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+                     get(LoadOpcode), Dest)
+          .addFrameIndex(FrameIndex)
+          .addImm(0)
+          .addReg(0);
     }
   }
 
@@ -830,17 +839,25 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     if (MMO->getSize() == Size && !MMO->isVolatile()) {
       // Handle conversion of loads.
       if (isSimpleBD12Move(MI, SystemZII::SimpleBDXLoad)) {
-        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
-          .addFrameIndex(FrameIndex).addImm(0).addImm(Size)
-          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
-          .addMemOperand(MMO);
+        return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+                       get(SystemZ::MVC))
+            .addFrameIndex(FrameIndex)
+            .addImm(0)
+            .addImm(Size)
+            .addOperand(MI->getOperand(1))
+            .addImm(MI->getOperand(2).getImm())
+            .addMemOperand(MMO);
       }
       // Handle conversion of stores.
       if (isSimpleBD12Move(MI, SystemZII::SimpleBDXStore)) {
-        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
-          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
-          .addImm(Size).addFrameIndex(FrameIndex).addImm(0)
-          .addMemOperand(MMO);
+        return BuildMI(*InsertPt->getParent(), InsertPt, MI->getDebugLoc(),
+                       get(SystemZ::MVC))
+            .addOperand(MI->getOperand(1))
+            .addImm(MI->getOperand(2).getImm())
+            .addImm(Size)
+            .addFrameIndex(FrameIndex)
+            .addImm(0)
+            .addMemOperand(MMO);
       }
     }
   }
@@ -856,7 +873,8 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       assert(AccessBytes != 0 && "Size of access should be known");
       assert(AccessBytes <= Size && "Access outside the frame index");
       uint64_t Offset = Size - AccessBytes;
-      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(MemOpcode));
+      MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+                                        MI->getDebugLoc(), get(MemOpcode));
       for (unsigned I = 0; I < OpNum; ++I)
         MIB.addOperand(MI->getOperand(I));
       MIB.addFrameIndex(FrameIndex).addImm(Offset);
@@ -869,10 +887,9 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   return nullptr;
 }
 
-MachineInstr *
-SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
-                                        ArrayRef<unsigned> Ops,
-                                        MachineInstr *LoadMI) const {
+MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
   return nullptr;
 }
 
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index b55810b253f1..e47f2ee9d0b6 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -187,9 +187,11 @@ public:
                                       LiveVariables *LV) const override;
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       int FrameIndex) const override;
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       MachineInstr *LoadMI) const override;
   bool expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const override;
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index a1dcedab54e7..2655e4866b20 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -68,11 +68,11 @@ SystemZMCInstLower::getExpr(const MachineOperand &MO,
   default:
     llvm_unreachable("unknown operand type");
   }
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
+  const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, Ctx);
   if (HasOffset)
     if (int64_t Offset = MO.getOffset()) {
-      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
-      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+      const MCExpr *OffsetExpr = MCConstantExpr::create(Offset, Ctx);
+      Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx);
     }
   return Expr;
 }
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index a184b92d3c9f..d498bb104ef8 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -313,7 +313,7 @@ const MCExpr *TargetLoweringObjectFile::getTTypeGlobalReference(
     const TargetMachine &TM, MachineModuleInfo *MMI,
     MCStreamer &Streamer) const {
   const MCSymbolRefExpr *Ref =
-      MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang), getContext());
+      MCSymbolRefExpr::create(TM.getSymbol(GV, Mang), getContext());
 
   return getTTypeReference(Ref, Encoding, Streamer);
 }
@@ -332,8 +332,8 @@ getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
     // .-foo addressing.
     MCSymbol *PCSym = getContext().createTempSymbol();
     Streamer.EmitLabel(PCSym);
-    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
-    return MCBinaryExpr::CreateSub(Sym, PC, getContext());
+    const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
+    return MCBinaryExpr::createSub(Sym, PC, getContext());
   }
   }
 }
@@ -341,7 +341,7 @@ getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
 const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   // FIXME: It's not clear what, if any, default this should have - perhaps a
   // null return could mean 'no location' & we should just do that here.
-  return MCSymbolRefExpr::Create(Sym, *Ctx);
+  return MCSymbolRefExpr::create(Sym, *Ctx);
 }
 
 void TargetLoweringObjectFile::getNameWithPrefix(
diff --git a/lib/Target/TargetRecip.cpp b/lib/Target/TargetRecip.cpp
new file mode 100644
index 000000000000..42bc487fe6d8
--- /dev/null
+++ b/lib/Target/TargetRecip.cpp
@@ -0,0 +1,225 @@
+//===-------------------------- TargetRecip.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is used to customize machine-specific reciprocal estimate code
+// generation in a target-independent way.
+// If a target does not support operations in this specification, then code
+// generation will default to using supported operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRecip.h"
+#include <map>
+
+using namespace llvm;
+
+// These are the names of the individual reciprocal operations. These are
+// the key strings for queries and command-line inputs; a trailing 'f' or 'd'
+// selects the float or double entry. In addition, the command-line interface
+// recognizes the global parameters "all", "none", and "default".
+static const char *RecipOps[] = {
+  "divd",
+  "divf",
+  "vec-divd",
+  "vec-divf",
+  "sqrtd",
+  "sqrtf",
+  "vec-sqrtd",
+  "vec-sqrtf",
+};
+
+// Seed the map with one default-constructed entry per known operation. The
+// uninitialized state is needed for the enabled settings and refinement
+// steps because custom settings may arrive via the command-line before target
+// defaults are set.
+TargetRecip::TargetRecip() {
+  for (const char *OpName : RecipOps)
+    RecipMap.insert(std::make_pair(OpName, RecipParams()));
+}
+
+/// Look for a ":N" refinement-step suffix in \p In. \p Position receives the
+/// index of the first ':' (or npos); \p Value is set only on a true return.
+static bool parseRefinementStep(const StringRef &In, size_t &Position,
+                                uint8_t &Value) {
+  Position = In.find(':');
+  if (Position == StringRef::npos)
+    return false;
+
+  // Exactly one character may follow the separator, and it must be a decimal
+  // digit; anything else is rejected as a fatal usage error.
+  StringRef Suffix = In.substr(Position + 1);
+  if (Suffix.size() != 1)
+    report_fatal_error("Invalid refinement step for -recip.");
+  const char Digit = Suffix[0];
+  if (Digit < '0' || Digit > '9')
+    report_fatal_error("Invalid refinement step for -recip.");
+  Value = Digit - '0';
+  return true;
+}
+
+/// Handle the whole-table arguments "all", "none" and "default", each of which
+/// may carry a ":N" refinement-step suffix that is applied to every
+/// operation.
+///
+/// \returns false, leaving the map untouched, when \p Arg names none of those
+/// three; it is then either invalid or an individual setting.
+bool TargetRecip::parseGlobalParams(const std::string &Arg) {
+  StringRef Name = Arg;
+
+  // Peel off the optional refinement-step count before matching the name.
+  // StepCount is only meaningful when HasStepCount is true.
+  size_t ColonPos;
+  uint8_t StepCount;
+  const bool HasStepCount = parseRefinementStep(Name, ColonPos, StepCount);
+  if (HasStepCount)
+    Name = Name.substr(0, ColonPos);
+
+  const bool IsAll = Name == "all";
+  const bool IsNone = Name == "none";
+
+  // Any other string is invalid or an individual setting.
+  if (!IsAll && !IsNone && Name != "default")
+    return false;
+
+  // "default" leaves the enable flags alone so that they can later be
+  // initialized to the target defaults; "all" and "none" force every entry
+  // on or off respectively.
+  if (IsAll || IsNone) {
+    for (auto &KV : RecipMap)
+      KV.second.Enabled = IsAll;
+  }
+
+  // A custom refinement count may accompany any of the three names and is
+  // applied to every entry.
+  if (HasStepCount) {
+    for (auto &KV : RecipMap)
+      KV.second.RefinementSteps = StepCount;
+  }
+
+  return true;
+}
+
+/// Apply per-operation settings of the form [!]name[:N]: '!' disables the
+/// operation, N is a refinement-step count, and a name without its 'f'/'d'
+/// suffix sets both precisions. Unknown or repeated names are fatal errors.
+void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) {
+  static const char DisabledPrefix = '!';
+
+  for (const std::string &Arg : Args) {
+    StringRef Val = Arg;
+
+    // Check for emptiness before indexing: Val[0] is out of bounds for an
+    // empty argument. An empty name matches no entry and is rejected below.
+    bool IsDisabled = !Val.empty() && Val[0] == DisabledPrefix;
+    // Ignore the disablement token for string matching.
+    if (IsDisabled)
+      Val = Val.substr(1);
+
+    size_t RefPos;
+    uint8_t RefSteps;
+    const bool HasRefSteps = parseRefinementStep(Val, RefPos, RefSteps);
+    if (HasRefSteps)
+      Val = Val.substr(0, RefPos);
+
+    RecipIter Iter = RecipMap.find(Val);
+    if (Iter == RecipMap.end()) {
+      // Try again specifying float suffix.
+      Iter = RecipMap.find(Val.str() + 'f');
+      if (Iter == RecipMap.end()) {
+        Iter = RecipMap.find(Val.str() + 'd');
+        assert(Iter == RecipMap.end() && "Float entry missing from map");
+        report_fatal_error("Invalid option for -recip.");
+      }
+
+      // No precision suffix was given: the double entry must not have been
+      // specified already. The float entry is checked below.
+      if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized)
+        report_fatal_error("Duplicate option for -recip.");
+    }
+
+    if (Iter->second.Enabled != Uninitialized)
+      report_fatal_error("Duplicate option for -recip.");
+
+    // Mark the matched option as found. Do not allow duplicate specifiers.
+    Iter->second.Enabled = !IsDisabled;
+    if (HasRefSteps)
+      Iter->second.RefinementSteps = RefSteps;
+
+    // If the precision was not specified, the double entry is also initialized.
+    if (Val.back() != 'f' && Val.back() != 'd') {
+      RecipParams &DoubleParams = RecipMap[Val.str() + 'd'];
+      DoubleParams.Enabled = !IsDisabled;
+      if (HasRefSteps)
+        DoubleParams.RefinementSteps = RefSteps;
+    }
+  }
+}
+
+/// Build the table from command-line style arguments. A lone argument is first
+/// tried as one of the global names "all", "default" or "none"; everything
+/// else is parsed as a list of individual settings.
+TargetRecip::TargetRecip(const std::vector<std::string> &Args)
+    : TargetRecip() {
+  // Short-circuiting keeps parseGlobalParams from seeing multi-argument input.
+  const bool HandledGlobally = Args.size() == 1 && parseGlobalParams(Args[0]);
+  if (!HandledGlobally)
+    parseIndividualParams(Args);
+}
+
+bool TargetRecip::isEnabled(const StringRef &Key) const {
+  ConstRecipIter Iter = RecipMap.find(Key); // Key must name a map entry.
+  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
+  assert(Iter->second.Enabled != Uninitialized &&
+         "Enablement setting was not initialized");
+  return Iter->second.Enabled; // Asserted above to be initialized.
+}
+
+unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
+  ConstRecipIter Iter = RecipMap.find(Key); // Key must name a map entry.
+  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
+  assert(Iter->second.RefinementSteps != Uninitialized &&
+         "Refinement step setting was not initialized");
+  return Iter->second.RefinementSteps; // Asserted above to be initialized.
+}
+
+/// Custom settings (previously initialized values) override target defaults:
+/// only fields that are still uninitialized receive \p Enable / \p RefSteps.
+void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
+                              unsigned RefSteps) {
+  auto FillUnset = [&](RecipParams &RP) {
+    if (RP.Enabled == Uninitialized)
+      RP.Enabled = Enable;
+    if (RP.RefinementSteps == Uninitialized)
+      RP.RefinementSteps = RefSteps;
+  };
+
+  // "all" addresses every entry; any other key is looked up with operator[].
+  if (Key != "all") {
+    FillUnset(RecipMap[Key]);
+    return;
+  }
+  for (auto &KV : RecipMap)
+    FillUnset(KV.second);
+}
+
+/// Two tables compare equal when every entry of this one has the same
+/// refinement-step count and enablement as the same-named entry of \p Other.
+bool TargetRecip::operator==(const TargetRecip &Other) const {
+  for (const auto &KV : RecipMap) {
+    // The lookup result is dereferenced unchecked: Other must hold the key.
+    const RecipParams &Theirs = Other.RecipMap.find(KV.first)->second;
+    if (KV.second.RefinementSteps != Theirs.RefinementSteps ||
+        KV.second.Enabled != Theirs.Enabled)
+      return false;
+  }
+  return true;
+}
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index a21f8c723503..9eee4a0f3d82 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -315,7 +315,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
 
   // Test (%SrcReg)
   {
-    const MCExpr *Disp = MCConstantExpr::Create(0, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
         getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
     InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
@@ -324,7 +324,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
 
   // Test -1(%SrcReg, %CntReg, AccessSize)
   {
-    const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
         getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(),
         SMLoc()));
@@ -334,7 +334,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
 
   // Test (%DstReg)
   {
-    const MCExpr *Disp = MCConstantExpr::Create(0, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(0, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
         getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
     InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
@@ -342,7 +342,7 @@ void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
 
   // Test -1(%DstReg, %CntReg, AccessSize)
   {
-    const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(-1, Ctx);
     std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
         getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(),
         SMLoc()));
@@ -461,7 +461,7 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
 
   while (Residue != 0) {
     const MCConstantExpr *Disp =
-        MCConstantExpr::Create(ApplyDisplacementBounds(Residue), Ctx);
+        MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx);
     std::unique_ptr<X86Operand> DispOp =
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(),
                               SMLoc());
@@ -493,7 +493,7 @@ X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
   CheckDisplacementBounds(NewDisplacement);
 
   *Residue = Displacement - NewDisplacement;
-  const MCExpr *Disp = MCConstantExpr::Create(NewDisplacement, Ctx);
+  const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx);
   return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp,
                                Op.getMemBaseReg(), Op.getMemIndexReg(),
                                Op.getMemScale(), SMLoc(), SMLoc());
@@ -615,7 +615,7 @@ private:
     const std::string &Fn = FuncName(AccessSize, IsWrite);
     MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn));
     const MCSymbolRefExpr *FnExpr =
-        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+        MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
     EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
   }
 };
@@ -643,7 +643,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
     MCInst Inst;
     Inst.setOpcode(X86::MOV8rm);
     Inst.addOperand(MCOperand::createReg(ShadowRegI8));
-    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
                               SMLoc(), SMLoc()));
@@ -654,7 +654,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
   EmitInstruction(
       Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
   MCSymbol *DoneSym = Ctx.createTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
   EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
@@ -669,7 +669,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall(
   case 1:
     break;
   case 2: {
-    const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
                               SMLoc(), SMLoc()));
@@ -720,7 +720,7 @@ void X86AddressSanitizer32::InstrumentMemOperandLarge(
       Inst.setOpcode(X86::CMP16mi);
       break;
     }
-    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1,
                               SMLoc(), SMLoc()));
@@ -729,7 +729,7 @@ void X86AddressSanitizer32::InstrumentMemOperandLarge(
     EmitInstruction(Out, Inst);
   }
   MCSymbol *DoneSym = Ctx.createTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
   EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
@@ -743,7 +743,7 @@ void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
 
   // No need to test when ECX is equals to zero.
   MCSymbol *DoneSym = Ctx.createTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
   EmitInstruction(
       Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
   EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
@@ -860,7 +860,7 @@ public:
 
 private:
   void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
-    const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1,
                               SMLoc(), SMLoc()));
@@ -885,7 +885,7 @@ private:
     const std::string &Fn = FuncName(AccessSize, IsWrite);
     MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn));
     const MCSymbolRefExpr *FnExpr =
-        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+        MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
     EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
   }
 };
@@ -914,7 +914,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
     MCInst Inst;
     Inst.setOpcode(X86::MOV8rm);
     Inst.addOperand(MCOperand::createReg(ShadowRegI8));
-    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
                               SMLoc(), SMLoc()));
@@ -925,7 +925,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
   EmitInstruction(
       Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
   MCSymbol *DoneSym = Ctx.createTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
   EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
@@ -940,7 +940,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall(
   case 1:
     break;
   case 2: {
-    const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(1, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1,
                               SMLoc(), SMLoc()));
@@ -991,7 +991,7 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge(
       Inst.setOpcode(X86::CMP16mi);
       break;
     }
-    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
         X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1,
                               SMLoc(), SMLoc()));
@@ -1001,7 +1001,7 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge(
   }
 
   MCSymbol *DoneSym = Ctx.createTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
   EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
 
   EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
@@ -1015,7 +1015,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
 
   // No need to test when RCX is equals to zero.
   MCSymbol *DoneSym = Ctx.createTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx);
   EmitInstruction(
       Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
   EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr));
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 3047fd1078a9..e8965710f022 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -973,7 +973,7 @@ void X86AsmParser::SetFrameRegister(unsigned RegNo) {
 std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
   unsigned basereg =
     is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
-  const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
+  const MCExpr *Disp = MCConstantExpr::create(0, getContext());
   return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
                                /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
                                Loc, Loc, 0);
@@ -982,7 +982,7 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
 std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
   unsigned basereg =
     is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI);
-  const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
+  const MCExpr *Disp = MCConstantExpr::create(0, getContext());
   return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
                                /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1,
                                Loc, Loc, 0);
@@ -1195,7 +1195,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
               getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b");
           MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
           const MCExpr *Val =
-	    MCSymbolRefExpr::Create(Sym, Variant, getContext());
+	    MCSymbolRefExpr::create(Sym, Variant, getContext());
           if (IDVal == "b" && Sym->isUndefined())
             return Error(Loc, "invalid reference to undefined symbol");
           StringRef Identifier = Sym->getName();
@@ -1265,9 +1265,9 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   }
 
   if (SM.getImm() || !Disp) {
-    const MCExpr *Imm = MCConstantExpr::Create(SM.getImm(), getContext());
+    const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext());
     if (Disp)
-      Disp = MCBinaryExpr::CreateAdd(Disp, Imm, getContext());
+      Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext());
     else
       Disp = Imm;  // An immediate displacement only.
   }
@@ -1354,7 +1354,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
   // Create the symbol reference.
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
   MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
-  Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
+  Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext());
   return false;
 }
 
@@ -1382,7 +1382,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
       // An immediate following a 'segment register', 'colon' token sequence can
       // be followed by a bracketed expression.  If it isn't we know we have our
       // final segment override.
-      const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext());
+      const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext());
       return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
                                    /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1,
                                    Start, ImmDispToken.getEndLoc(), Size);
@@ -1435,7 +1435,7 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
       return ErrorOperand(Tok.getLoc(), "Expected } at this point");
     Parser.Lex();  // Eat "}"
     const MCExpr *RndModeOp =
-      MCConstantExpr::Create(rndMode, Parser.getContext());
+      MCConstantExpr::create(rndMode, Parser.getContext());
     return X86Operand::CreateImm(RndModeOp, Start, End);
   }
   if(Tok.getIdentifier().equals("sae")){
@@ -1499,7 +1499,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
     return nullptr;
   }
 
-  const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext());
+  const MCExpr *Disp = MCConstantExpr::create(SM.getImm(), getContext());
   // BaseReg is non-zero to avoid assertions.  In the context of inline asm,
   // we're pointing to a local variable in memory, so the base register is
   // really the frame or stack pointer.
@@ -1549,7 +1549,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
                                                 Val));
   }
 
-  NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+  NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
   return false;
 }
 
@@ -1623,7 +1623,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   unsigned Len = End.getPointer() - TypeLoc.getPointer();
   InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
 
-  const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext());
+  const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
   return X86Operand::CreateImm(Imm, Start, End);
 }
 
@@ -1683,7 +1683,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
         return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
                                      Size);
 
-      const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext());
+      const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
       return X86Operand::CreateImm(ImmExpr, Start, End);
     }
 
@@ -1841,7 +1841,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
   // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)".  The
   // only way to do this without lookahead is to eat the '(' and see what is
   // after it.
-  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+  const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext());
   if (getLexer().isNot(AsmToken::LParen)) {
     SMLoc ExprEnd;
     if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
@@ -2061,7 +2061,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx),
                                                  NameLoc));
 
-      const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode,
+      const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
                                                    getParser().getContext());
       Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
 
@@ -2088,7 +2088,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) {
       Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc));
 
-      const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode,
+      const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
                                                    getParser().getContext());
       Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
 
@@ -2115,7 +2115,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     if (ComparisonCode != ~0U) {
       Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc));
 
-      const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode,
+      const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode,
                                                    getParser().getContext());
       Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc));
 
@@ -2375,7 +2375,7 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
     X86Operand &Op = static_cast<X86Operand &>(*Ops[1]);
     assert(Op.isImm() && "expected immediate");
     int64_t Res;
-    if (!Op.getImm()->EvaluateAsAbsolute(Res) || Res > 255) {
+    if (!Op.getImm()->evaluateAsAbsolute(Res) || Res > 255) {
       Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]");
       return false;
     }
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 3469d19f4fd2..6e99c37c2bc7 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -546,6 +546,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   case TYPE_XMM512:
     mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
     return;
+  case TYPE_BNDR:
+    mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
   case TYPE_REL8:
     isBranch = true;
     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
@@ -827,6 +829,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
   case TYPE_VK16:
   case TYPE_DEBUGREG:
   case TYPE_CONTROLREG:
+  case TYPE_BNDR:
     return translateRMRegister(mcInst, insn);
   case TYPE_M:
   case TYPE_M8:
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 9e6505001393..301db72feafb 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -460,6 +460,7 @@ enum OperandEncoding {
   ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
   ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
   ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand")                      \
+  ENUM_ENTRY(TYPE_BNDR,       "MPX bounds register")                           \
                                                                                \
   ENUM_ENTRY(TYPE_Mv,         "Memory operand of operand size")                \
   ENUM_ENTRY(TYPE_Rv,         "Register operand of operand size")              \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index af4399a41a06..ea727e6e82fb 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -150,11 +150,11 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
     // that address in hex.
     const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
     int64_t Address;
-    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+    if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
       O << formatHex((uint64_t)Address);
     } else {
       // Otherwise, just print the expression.
-      O << *Op.getExpr();
+      Op.getExpr()->print(O, &MAI);
     }
   }
 }
@@ -178,7 +178,9 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
 
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << markup("<imm:") << '$' << *Op.getExpr() << markup(">");
+    O << markup("<imm:") << '$';
+    Op.getExpr()->print(O, &MAI);
+    O << markup(">");
   }
 }
 
@@ -203,7 +205,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
       O << formatImm(DispVal);
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    O << *DispSpec.getExpr();
+    DispSpec.getExpr()->print(O, &MAI);
   }
 
   if (IndexReg.getReg() || BaseReg.getReg()) {
@@ -273,7 +275,7 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
     O << formatImm(DispSpec.getImm());
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement?");
-    O << *DispSpec.getExpr();
+    DispSpec.getExpr()->print(O, &MAI);
   }
 
   O << markup(">");
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 4d92dafa938a..879378fc7a97 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -131,12 +131,12 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
     // that address in hex.
     const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
     int64_t Address;
-    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+    if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
       O << formatHex((uint64_t)Address);
     }
     else {
       // Otherwise, just print the expression.
-      O << *Op.getExpr();
+      Op.getExpr()->print(O, &MAI);
     }
   }
 }
@@ -150,7 +150,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << formatImm((int64_t)Op.getImm());
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << *Op.getExpr();
+    Op.getExpr()->print(O, &MAI);
   }
 }
 
@@ -187,7 +187,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
   if (!DispSpec.isImm()) {
     if (NeedPlus) O << " + ";
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    O << *DispSpec.getExpr();
+    DispSpec.getExpr()->print(O, &MAI);
   } else {
     int64_t DispVal = DispSpec.getImm();
     if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
@@ -245,7 +245,7 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
     O << formatImm(DispSpec.getImm());
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement?");
-    O << *DispSpec.getExpr();
+    DispSpec.getExpr()->print(O, &MAI);
   }
 
   O << ']';
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 6d4284dc518b..1ac656d4614b 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -326,7 +326,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
   // FIXME: We could generated something better than plain 0x90.
   if (!HasNopl) {
     for (uint64_t i = 0; i < Count; ++i)
-      OW->Write8(0x90);
+      OW->write8(0x90);
     return true;
   }
 
@@ -336,10 +336,10 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
     const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
     const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
     for (uint8_t i = 0; i < Prefixes; i++)
-      OW->Write8(0x66);
+      OW->write8(0x66);
     const uint8_t Rest = ThisNopLength - Prefixes;
     for (uint8_t i = 0; i < Rest; i++)
-      OW->Write8(Nops[Rest - 1][i]);
+      OW->write8(Nops[Rest - 1][i]);
     Count -= ThisNopLength;
   } while (Count != 0);
 
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 45088835cfb9..a33468dc4769 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -66,6 +66,7 @@ static X86_64RelType getType64(unsigned Kind,
   case X86::reloc_riprel_4byte:
   case X86::reloc_riprel_4byte_movq_load:
     return RT64_32;
+  case FK_PCRel_2:
   case FK_Data_2:
     return RT64_16;
   case FK_PCRel_1:
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index a39def98e48e..2943dd383efa 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -31,13 +31,13 @@ public:
 
     StringRef SymName; SymI->getName(SymName);
     uint64_t  SymAddr; SymI->getAddress(SymAddr);
-    uint64_t  SymSize; SymI->getSize(SymSize);
+    uint64_t SymSize = SymI->getSize();
     int64_t  Addend;  getELFRelocationAddend(Rel, Addend);
 
     MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
     // FIXME: check that the value is actually the same.
     if (!Sym->isVariable())
-      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+      Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx));
 
     const MCExpr *Expr = nullptr;
     // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
@@ -76,7 +76,7 @@ public:
     case R_X86_64_PC64:
       // S + A - P (P/pcrel is implicit)
       hasAddend = true;
-      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, Ctx);
       break;
     case R_X86_64_GOT32:
     case R_X86_64_GOT64:
@@ -85,27 +85,27 @@ public:
     case R_X86_64_GOTPLT64:
       // G + A
       hasAddend = true;
-      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
       break;
     case R_X86_64_PLT32:
       // L + A - P -> S@PLT + A
       hasAddend = true;
-      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
       break;
     case R_X86_64_GLOB_DAT:
     case R_X86_64_JUMP_SLOT:
       // S
-      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, Ctx);
       break;
     case R_X86_64_GOTPCREL:
     case R_X86_64_GOTPCREL64:
       // G + GOT + A - P -> S@GOTPCREL + A
       hasAddend = true;
-      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
       break;
     case R_X86_64_GOTOFF64:
       // S + A - GOT
-      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
       break;
     case R_X86_64_PLTOFF64:
       // L + A - GOT
@@ -113,15 +113,15 @@ public:
     case R_X86_64_SIZE32:
     case R_X86_64_SIZE64:
       // Z + A
-      Expr = MCConstantExpr::Create(SymSize, Ctx);
+      Expr = MCConstantExpr::create(SymSize, Ctx);
       break;
     default:
-      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, Ctx);
       break;
     }
     if (Expr && hasAddend && Addend != 0)
-      Expr = MCBinaryExpr::CreateAdd(Expr,
-                                     MCConstantExpr::Create(Addend, Ctx),
+      Expr = MCBinaryExpr::createAdd(Expr,
+                                     MCConstantExpr::create(Addend, Ctx),
                                      Ctx);
     return Expr;
   }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index bda35f2b9726..fc0b0f89e23d 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -119,9 +119,9 @@ X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                    MCStreamer &Streamer) const {
   MCContext &Context = Streamer.getContext();
   const MCExpr *Res =
-    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
-  const MCExpr *Four = MCConstantExpr::Create(4, Context);
-  return MCBinaryExpr::CreateAdd(Res, Four, Context);
+    MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+  const MCExpr *Four = MCConstantExpr::create(4, Context);
+  return MCBinaryExpr::createAdd(Res, Four, Context);
 }
 
 void X86MCAsmInfoMicrosoft::anchor() { }
@@ -132,6 +132,11 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
     PrivateLabelPrefix = ".L";
     PointerSize = 8;
     WinEHEncodingType = WinEH::EncodingType::Itanium;
+  } else {
+    // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just
+    // a placeholder that the Windows EHStreamer looks for to suppress CFI
+    // output. In particular, usesWindowsCFI() returns false.
+    WinEHEncodingType = WinEH::EncodingType::X86;
   }
 
   ExceptionsType = ExceptionHandling::WinEH;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8aed7a4d9eb9..10c434c8b1b4 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -304,7 +304,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
       EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
       return;
     }
-    Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx);
+    Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
   } else {
     Expr = DispOp.getExpr();
   }
@@ -351,7 +351,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
     ImmOffset -= 1;
 
   if (ImmOffset)
-    Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
+    Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx),
                                    Ctx);
 
   // Emit a symbolic constant as a fixup and 4 zeros.
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 8e3c72158fc0..cc98e55dc695 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -115,8 +115,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
   return X;
 }
 
-static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
-  Triple TheTriple(TT);
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
+                                     const Triple &TheTriple) {
   bool is64Bit = TheTriple.getArch() == Triple::x86_64;
 
   MCAsmInfo *MAI;
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index 6cf5af7217f9..a5aadd6a385e 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -39,33 +39,33 @@ public:
     MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName);
     // FIXME: check that the value is actually the same.
     if (!Sym->isVariable())
-      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+      Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx));
     const MCExpr *Expr = nullptr;
 
     switch(RelType) {
     case X86_64_RELOC_TLV:
-      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
       break;
     case X86_64_RELOC_SIGNED_4:
-      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
-                                     MCConstantExpr::Create(4, Ctx),
+      Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
+                                     MCConstantExpr::create(4, Ctx),
                                      Ctx);
       break;
     case X86_64_RELOC_SIGNED_2:
-      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
-                                     MCConstantExpr::Create(2, Ctx),
+      Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
+                                     MCConstantExpr::create(2, Ctx),
                                      Ctx);
       break;
     case X86_64_RELOC_SIGNED_1:
-      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
-                                     MCConstantExpr::Create(1, Ctx),
+      Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx),
+                                     MCConstantExpr::create(1, Ctx),
                                      Ctx);
       break;
     case X86_64_RELOC_GOT_LOAD:
-      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
       break;
     case X86_64_RELOC_GOT:
-      Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
+      Expr = MCSymbolRefExpr::create(Sym, isPCRel ?
                                      MCSymbolRefExpr::VK_GOTPCREL :
                                      MCSymbolRefExpr::VK_GOT,
                                      Ctx);
@@ -84,7 +84,7 @@ public:
           report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
                              "X86_64_RELOC_SUBTRACTOR.");
 
-        const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx);
+        const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx);
 
         symbol_iterator RSymI = Rel.getSymbol();
         uint64_t RSymAddr;
@@ -94,15 +94,15 @@ public:
 
         MCSymbol *RSym = Ctx.getOrCreateSymbol(RSymName);
         if (!RSym->isVariable())
-          RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx));
+          RSym->setVariableValue(MCConstantExpr::create(RSymAddr, Ctx));
 
-        const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx);
+        const MCExpr *RHS = MCSymbolRefExpr::create(RSym, Ctx);
 
-        Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+        Expr = MCBinaryExpr::createSub(LHS, RHS, Ctx);
         break;
       }
     default:
-      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      Expr = MCSymbolRefExpr::create(Sym, Ctx);
       break;
     }
     return Expr;
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 9da3e1fc36bf..95acc07192da 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -25,7 +25,7 @@ using namespace llvm;
 
 namespace {
 class X86MachObjectWriter : public MCMachObjectTargetWriter {
-  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+  bool recordScatteredRelocation(MachObjectWriter *Writer,
                                  const MCAssembler &Asm,
                                  const MCAsmLayout &Layout,
                                  const MCFragment *Fragment,
@@ -33,7 +33,7 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter {
                                  MCValue Target,
                                  unsigned Log2Size,
                                  uint64_t &FixedValue);
-  void RecordTLVPRelocation(MachObjectWriter *Writer,
+  void recordTLVPRelocation(MachObjectWriter *Writer,
                             const MCAssembler &Asm,
                             const MCAsmLayout &Layout,
                             const MCFragment *Fragment,
@@ -54,12 +54,10 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter {
                               MCValue Target, uint64_t &FixedValue);
 
 public:
-  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
-                               /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
 
-  void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
+  void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override {
@@ -142,13 +140,11 @@ void X86MachObjectWriter::RecordX86_64Relocation(
     const MCSymbol *A = &Target.getSymA()->getSymbol();
     if (A->isTemporary())
       A = &Writer->findAliasedSymbol(*A);
-    const MCSymbolData &A_SD = Asm.getSymbolData(*A);
     const MCSymbol *A_Base = Asm.getAtom(*A);
 
     const MCSymbol *B = &Target.getSymB()->getSymbol();
     if (B->isTemporary())
       B = &Writer->findAliasedSymbol(*B);
-    const MCSymbolData &B_SD = Asm.getSymbolData(*B);
     const MCSymbol *B_Base = Asm.getAtom(*B);
 
     // Neither symbol can be modified.
@@ -190,7 +186,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
              (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout));
 
     if (!A_Base)
-      Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
+      Index = A->getFragment()->getParent()->getOrdinal() + 1;
     Type = MachO::X86_64_RELOC_UNSIGNED;
 
     MachO::any_relocation_info MRE;
@@ -202,7 +198,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
     if (B_Base)
       RelSymbol = B_Base;
     else
-      Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
+      Index = B->getFragment()->getParent()->getOrdinal() + 1;
     Type = MachO::X86_64_RELOC_SUBTRACTOR;
   } else {
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
@@ -211,7 +207,6 @@ void X86MachObjectWriter::RecordX86_64Relocation(
       if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
         Asm.addLocalUsedInReloc(*Symbol);
     }
-    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
     RelSymbol = Asm.getAtom(*Symbol);
 
     // Relocations inside debug sections always use local relocations when
@@ -235,7 +230,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
                  Layout.getSymbolOffset(*RelSymbol);
     } else if (Symbol->isInSection() && !Symbol->isVariable()) {
       // The index is the section ordinal (1-based).
-      Index = SD.getFragment()->getParent()->getOrdinal() + 1;
+      Index = Symbol->getFragment()->getParent()->getOrdinal() + 1;
       Value += Writer->getSymbolAddress(*Symbol, Layout);
 
       if (IsPCRel)
@@ -243,7 +238,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
     } else if (Symbol->isVariable()) {
       const MCExpr *Value = Symbol->getVariableValue();
       int64_t Res;
-      bool isAbs = Value->EvaluateAsAbsolute(Res, Layout,
+      bool isAbs = Value->evaluateAsAbsolute(Res, Layout,
                                              Writer->getSectionAddressMap());
       if (isAbs) {
         FixedValue = Res;
@@ -339,7 +334,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
   Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
 }
 
-bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
                                                     const MCAssembler &Asm,
                                                     const MCAsmLayout &Layout,
                                                     const MCFragment *Fragment,
@@ -354,23 +349,21 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
-  if (!A_SD->getFragment())
+  if (!A->getFragment())
     report_fatal_error("symbol '" + A->getName() +
                        "' can not be undefined in a subtraction expression",
                        false);
 
   uint32_t Value = Writer->getSymbolAddress(*A, Layout);
-  uint64_t SecAddr =
-      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
   FixedValue += SecAddr;
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbol *SB = &B->getSymbol();
 
-    if (!B_SD->getFragment())
+    if (!SB->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
                          "' can not be undefined in a subtraction expression",
                          false);
@@ -380,10 +373,10 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
     // Note that there is no longer any semantic difference between these two
     // relocation types from the linkers point of view, this is done solely for
     // pedantic compatibility with 'as'.
-    Type = A_SD->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF :
-      (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
+    Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
+                           : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
-    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+    FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
@@ -435,7 +428,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
   return true;
 }
 
-void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
+void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
                                                const MCAssembler &Asm,
                                                const MCAsmLayout &Layout,
                                                const MCFragment *Fragment,
@@ -490,7 +483,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
   // If this is a 32-bit TLVP reloc it's handled a bit differently.
   if (Target.getSymA() &&
       Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
-    RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+    recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
                          FixedValue);
     return;
   }
@@ -499,7 +492,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
   // scattered relocation entry. Differences always require scattered
   // relocations.
   if (Target.getSymB()) {
-    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+    recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
                               Target, Log2Size, FixedValue);
     return;
   }
@@ -515,10 +508,10 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
   if (IsPCRel)
     Offset += 1 << Log2Size;
   // Try to record the scattered relocation if needed. Fall back to non
-  // scattered if necessary (see comments in RecordScatteredRelocation()
+  // scattered if necessary (see comments in recordScatteredRelocation()
   // for details).
   if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) &&
-      RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+      recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
                                 Log2Size, FixedValue))
     return;
 
@@ -538,7 +531,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
     // Resolve constant variables.
     if (A->isVariable()) {
       int64_t Res;
-      if (A->getVariableValue()->EvaluateAsAbsolute(
+      if (A->getVariableValue()->evaluateAsAbsolute(
               Res, Layout, Writer->getSectionAddressMap())) {
         FixedValue = Res;
         return;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index c70e2e954631..852267400bba 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -168,6 +168,8 @@ def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
                                       "Support PRFCHW instructions">;
 def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
                                       "Support RDSEED instruction">;
+def FeatureMPX     : SubtargetFeature<"mpx", "HasMPX", "true",
+                                      "Support MPX instructions">;
 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                                      "Use LEA for adjusting the stack pointer">;
 def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
@@ -188,10 +190,6 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
                                    "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                    "INC and DEC instructions are slower than ADD and SUB">;
-def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
-                            "Use RSQRT* to optimize square root calculations">;
-def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
-                          "true", "Use RCP* to optimize division calculations">;
 def FeatureSoftFloat
     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
@@ -380,7 +378,7 @@ class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel,
                       FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                       FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
-                      FeatureSlowIncDec]>;
+                      FeatureSlowIncDec, FeatureMPX]>;
 def : KnightsLandingProc<"knl">;
 
 // FIXME: define SKX model
@@ -391,7 +389,7 @@ class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel,
                       FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
                       FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
-                      FeatureSlowIncDec]>;
+                      FeatureSlowIncDec, FeatureMPX]>;
 def : SkylakeProc<"skylake">;
 def : SkylakeProc<"skx">; // Legacy alias.
 
@@ -444,7 +442,7 @@ def : ProcessorModel<"btver2", BtVer2Model,
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
                       FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
-                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
+                      FeatureSlowSHLD]>;
 
 // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
 
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index f97557e5c609..64fc6d0d7e5c 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -78,7 +78,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
   switch (MO.getType()) {
   default: llvm_unreachable("unknown symbol type!");
   case MachineOperand::MO_ConstantPoolIndex:
-    O << *P.GetCPISymbol(MO.getIndex());
+    P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
     P.printOffset(MO.getOffset(), O);
     break;
   case MachineOperand::MO_GlobalAddress: {
@@ -127,9 +127,12 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
     // If the name begins with a dollar-sign, enclose it in parens.  We do this
     // to avoid having it look like an integer immediate to the assembler.
     if (GVSym->getName()[0] != '$')
-      O << *GVSym;
-    else
-      O << '(' << *GVSym << ')';
+      GVSym->print(O, P.MAI);
+    else {
+      O << '(';
+      GVSym->print(O, P.MAI);
+      O << ')';
+    }
     P.printOffset(MO.getOffset(), O);
     break;
   }
@@ -146,12 +149,15 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
     // These affect the name of the symbol, not any suffix.
     break;
   case X86II::MO_GOT_ABSOLUTE_ADDRESS:
-    O << " + [.-" << *P.MF->getPICBaseSymbol() << ']';
+    O << " + [.-";
+    P.MF->getPICBaseSymbol()->print(O, P.MAI);
+    O << ']';
     break;
   case X86II::MO_PIC_BASE_OFFSET:
   case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
   case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
-    O << '-' << *P.MF->getPICBaseSymbol();
+    O << '-';
+    P.MF->getPICBaseSymbol()->print(O, P.MAI);
     break;
   case X86II::MO_TLSGD:     O << "@TLSGD";     break;
   case X86II::MO_TLSLD:     O << "@TLSLD";     break;
@@ -168,7 +174,8 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
   case X86II::MO_PLT:       O << "@PLT";       break;
   case X86II::MO_TLVP:      O << "@TLVP";      break;
   case X86II::MO_TLVP_PIC_BASE:
-    O << "@TLVP" << '-' << *P.MF->getPICBaseSymbol();
+    O << "@TLVP" << '-';
+    P.MF->getPICBaseSymbol()->print(O, P.MAI);
     break;
   case X86II::MO_SECREL:    O << "@SECREL32";  break;
   }
@@ -525,7 +532,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
       // register any SEH handlers, so its object files should be safe.
       OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
       OutStreamer->EmitAssignment(
-          S, MCConstantExpr::Create(int64_t(1), MMI->getContext()));
+          S, MCConstantExpr::create(int64_t(1), MMI->getContext()));
     }
   }
 }
@@ -549,7 +556,7 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
     // using NLPs; however, sometimes the types are local to the file.
     // We need to fill in the value for the NLP in those cases.
     OutStreamer.EmitValue(
-        MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+        MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
         4 /*size*/);
 }
 
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 9af0aebea232..3dc75d76cee3 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3530,9 +3530,9 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   SmallVector<MachineOperand, 8> AddrOps;
   AM.getFullAddress(AddrOps);
 
-  MachineInstr *Result =
-    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
-                              Size, Alignment, /*AllowCommute=*/true);
+  MachineInstr *Result = XII.foldMemoryOperandImpl(
+      *FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+      /*AllowCommute=*/true);
   if (!Result)
     return false;
 
@@ -3541,20 +3541,21 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   // to just look at OpNo + the offset to the index reg.  We actually need to
   // scan the instruction to find the index reg and see if its the correct reg
   // class.
-  for (MIOperands MO(Result); MO.isValid(); ++MO) {
-    if (!MO->isReg() || MO->isDef() || MO->getReg() != AM.IndexReg)
+  unsigned OperandNo = 0;
+  for (MachineInstr::mop_iterator I = Result->operands_begin(),
+       E = Result->operands_end(); I != E; ++I, ++OperandNo) {
+    MachineOperand &MO = *I;
+    if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
       continue;
     // Found the index reg, now try to rewrite it.
-    unsigned OpNo = MO.getOperandNo();
     unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
-                                                 MO->getReg(), OpNo);
-    if (IndexReg == MO->getReg())
+                                                 MO.getReg(), OperandNo);
+    if (IndexReg == MO.getReg())
       continue;
-    MO->setReg(IndexReg);
+    MO.setReg(IndexReg);
   }
 
   Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
-  FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
   MI->eraseFromParent();
   return true;
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 3ba811574489..e3ec288a683e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
              "rather than promotion."),
     cl::Hidden);
 
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
-    "x86-recip-refinement-steps", cl::init(1),
-    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
-             "result of the hardware reciprocal estimate instruction."),
-    cl::NotHidden);
-
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
@@ -842,13 +836,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 
-    // Only provide customized ctpop vector bit twiddling for vector types we
-    // know to perform better than using the popcnt instructions on each vector
-    // element. If popcnt isn't supported, always provide the custom version.
-    if (!Subtarget->hasPOPCNT()) {
-      setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
-      setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
-    }
+    setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
 
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
@@ -1113,6 +1104,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
 
+    setOperationAction(ISD::CTPOP,             MVT::v32i8, Custom);
+    setOperationAction(ISD::CTPOP,             MVT::v16i16, Custom);
+    setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
+    setOperationAction(ISD::CTPOP,             MVT::v4i64, Custom);
+
     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
@@ -1147,16 +1143,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
 
-      // Only provide customized ctpop vector bit twiddling for vector types we
-      // know to perform better than using the popcnt instructions on each
-      // vector element. If popcnt isn't supported, always provide the custom
-      // version.
-      if (!Subtarget->hasPOPCNT())
-        setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
-
-      // Custom CTPOP always performs better on natively supported v8i32
-      setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
-
       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
@@ -1273,7 +1259,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i16,  Legal);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64,  MVT::v8i32,  Legal);
-    
+
     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
@@ -1842,7 +1828,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
          Subtarget->isPICStyleGOT());
   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   // entries.
-  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
 }
 
@@ -1866,7 +1852,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
 
   // Otherwise, the reference is relative to the PIC base.
-  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
 }
 
 std::pair<const TargetRegisterClass *, uint8_t>
@@ -1981,7 +1967,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
     }
     else if (VA.getLocInfo() == CCValAssign::BCvt)
-      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
 
     assert(VA.getLocInfo() != CCValAssign::FPExt &&
            "Unexpected FP-extend for return value.");
@@ -2018,13 +2004,13 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     if (Subtarget->is64Bit()) {
       if (ValVT == MVT::x86mmx) {
         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
-          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
+          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                   ValToCopy);
           // If we don't have SSE2 available, convert to v4f32 so the generated
           // register is legal.
           if (!Subtarget->hasSSE2())
-            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
+            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
         }
       }
     }
@@ -2451,7 +2437,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                                DAG.getValueType(VA.getValVT()));
       else if (VA.getLocInfo() == CCValAssign::BCvt)
-        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
 
       if (VA.isExtInLoc()) {
         // Handle MMX values passed in XMM regs.
@@ -2780,6 +2766,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
 
+  if (Subtarget->isPICStyleGOT() &&
+      !MF.getTarget().Options.GuaranteedTailCallOpt) {
+    // If we are using a GOT, disable tail calls to external symbols with
+    // default visibility. Tail calling such a symbol requires using a GOT
+    // relocation, which forces early binding of the symbol. This breaks code
+    // that requires lazy function symbol resolution. Using musttail or
+    // GuaranteedTailCallOpt will override this.
+    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+               G->getGlobal()->hasDefaultVisibility()))
+      isTailCall = false;
+  }
+
   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
   if (IsMustTail) {
     // Force this to be a tail call.  The verifier rules are enough to ensure
@@ -2898,14 +2897,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
       else if (RegVT.is128BitVector()) {
         // Special case: passing MMX values in XMM registers.
-        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+        Arg = DAG.getBitcast(MVT::i64, Arg);
         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
       } else
         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
       break;
     case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+      Arg = DAG.getBitcast(RegVT, Arg);
       break;
     case CCValAssign::Indirect: {
       // Store the argument.
@@ -2964,8 +2963,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
       // Note: The actual moving to ECX is done further down.
       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-      if (G && !G->getGlobal()->hasHiddenVisibility() &&
-          !G->getGlobal()->hasProtectedVisibility())
+      if (G && !G->getGlobal()->hasLocalLinkage() &&
+          G->getGlobal()->hasDefaultVisibility())
         Callee = LowerGlobalAddress(Callee, DAG);
       else if (isa<ExternalSymbolSDNode>(Callee))
         Callee = LowerExternalSymbol(Callee, DAG);
@@ -4073,7 +4072,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
   } else
     llvm_unreachable("Unexpected vector type");
 
-  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  return DAG.getBitcast(VT, Vec);
 }
 
 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
@@ -4200,9 +4199,9 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
 
     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
-    Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+    Vec256 = DAG.getBitcast(CastVT, Vec256);
     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
-    return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+    return DAG.getBitcast(ResultVT, Vec256);
   }
 
   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
@@ -4255,7 +4254,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   } else
     llvm_unreachable("Unexpected vector type");
 
-  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  return DAG.getBitcast(VT, Vec);
 }
 
 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4611,7 +4610,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     }
   }
 
-  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
+  return DAG.getBitcast(MVT::v16i8, V);
 }
 
 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
@@ -4749,7 +4748,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
   SDLoc DL(Op);
   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                                DAG.getIntPtrConstant(InsertPSMask, DL));
-  return DAG.getNode(ISD::BITCAST, DL, VT, Result);
+  return DAG.getBitcast(VT, Result);
 }
 
 /// Return a vector logical shift node.
@@ -4759,12 +4758,11 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
   assert(VT.is128BitVector() && "Unknown type for VShift");
   MVT ShVT = MVT::v2i64;
   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
-  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+  SrcOp = DAG.getBitcast(ShVT, SrcOp);
   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
-  return DAG.getNode(ISD::BITCAST, dl, VT,
-                     DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
+  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
 }
 
 static SDValue
@@ -4949,7 +4947,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                              SDValue(ResNode.getNode(), 1));
     }
 
-    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
+    return DAG.getBitcast(VT, ResNode);
   }
   return SDValue();
 }
@@ -5261,8 +5259,8 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
     SDValue Imm = ConvertI1VectorToInterger(Op, DAG);
     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
-      return DAG.getNode(ISD::BITCAST, dl, VT, Imm);
-    SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm);
+      return DAG.getBitcast(VT, Imm);
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
   }
@@ -5277,7 +5275,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
     SDValue In = Op.getOperand(idx);
     if (In.getOpcode() == ISD::UNDEF)
       continue;
-    if (!isa<ConstantSDNode>(In)) 
+    if (!isa<ConstantSDNode>(In))
       NonConstIdx.push_back(idx);
     else {
       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
@@ -5304,12 +5302,12 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   }
   else if (HasConstElts)
     Imm = DAG.getConstant(0, dl, VT);
-  else 
+  else
     Imm = DAG.getUNDEF(VT);
   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
-    DstVec = DAG.getNode(ISD::BITCAST, dl, VT, Imm);
+    DstVec = DAG.getBitcast(VT, Imm);
   else {
-    SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm);
+    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                          DAG.getIntPtrConstant(0, dl));
   }
@@ -5818,9 +5816,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
         // convert it to a vector with movd (S2V+shuffle to zero extend).
         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
-        return DAG.getNode(
-            ISD::BITCAST, dl, VT,
-            getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
+        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
+                                      Item, Idx * 2, true, Subtarget, DAG));
       }
     }
 
@@ -5866,7 +5863,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
         }
-        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+        return DAG.getBitcast(VT, Item);
       }
     }
 
@@ -6257,6 +6254,42 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   return true;
 }
 
+/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may, however, involve a blend from
+/// the same lane of a second vector. Returns true if the mask repeats.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 256-bit shuffles: entries taken from V2
+/// are encoded as their in-lane index plus the size of the wider mask.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = 256 / VT.getScalarSizeInBits();
+  RepeatedMask.resize(LaneSize, -1);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // The source element is in a different lane than slot i: lane-crossing.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    if (RepeatedMask[i % LaneSize] == -1)
+      // First non-undef entry for this slot; V2 entries are offset by Size.
+      RepeatedMask[i % LaneSize] =
+          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+      // Rebasing the recorded entry onto this lane does not give Mask[i].
+      return false;
+  }
+  return true;
+}
+
 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
 /// arguments.
 ///
@@ -6316,6 +6349,22 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
   return DAG.getConstant(Imm, DL, MVT::i8);
 }
 
+/// \brief Get an 8-bit shuffle immediate, 1 bit per lane, for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate for x86
+/// instructions that shuffle up to 8 lanes: bit i is the low bit of Mask[i],
+/// and negative mask entries leave their bit clear.
+static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+                                             SelectionDAG &DAG) {
+  assert(Mask.size() <= 8 &&
+         "Up to 8 elts may be in Imm8 1-bit lane shuffle mask");
+  unsigned Imm = 0;
+  for (unsigned i = 0; i < Mask.size(); ++i)
+    if (Mask[i] >= 0)
+      Imm |= (Mask[i] % 2) << i;
+  return DAG.getConstant(Imm, DL, MVT::i8);
+}
+
 /// \brief Try to emit a blend instruction for a shuffle using bit math.
 ///
 /// This is used as a fallback approach when first class blend instructions are
@@ -6341,10 +6390,9 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
   // We have to cast V2 around.
   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-  V2 = DAG.getNode(ISD::BITCAST, DL, VT,
-                   DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
-                               DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
-                               DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
+  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+                                      DAG.getBitcast(MaskVT, V1Mask),
+                                      DAG.getBitcast(MaskVT, V2)));
   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
 }
 
@@ -6395,11 +6443,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
             BlendMask |= 1u << (i * Scale + j);
 
       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
-      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-      return DAG.getNode(ISD::BITCAST, DL, VT,
-                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
-                                     DAG.getConstant(BlendMask, DL, MVT::i8)));
+      V1 = DAG.getBitcast(BlendVT, V1);
+      V2 = DAG.getBitcast(BlendVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                          DAG.getConstant(BlendMask, DL, MVT::i8)));
     }
     // FALLTHROUGH
   case MVT::v8i16: {
@@ -6412,11 +6460,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
         for (int j = 0; j < Scale; ++j)
           BlendMask |= 1u << (i * Scale + j);
 
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
-                                   DAG.getConstant(BlendMask, DL, MVT::i8)));
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = DAG.getBitcast(MVT::v8i16, V2);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
   }
 
   case MVT::v16i16: {
@@ -6465,13 +6513,12 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                           MVT::i8));
 
-    V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(ISD::VSELECT, DL, BlendVT,
-                    DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
-                    V1, V2));
+    V1 = DAG.getBitcast(BlendVT, V1);
+    V2 = DAG.getBitcast(BlendVT, V2);
+    return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+                                          DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                      BlendVT, VSELECTMask),
+                                          V1, V2));
   }
 
   default:
@@ -6652,13 +6699,12 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   if (Subtarget->hasSSSE3()) {
     // Cast the inputs to i8 vector of correct length to match PALIGNR.
     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
-    Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
-    Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
+    Lo = DAG.getBitcast(AlignVT, Lo);
+    Hi = DAG.getBitcast(AlignVT, Hi);
 
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
-                                   DAG.getConstant(Rotation * Scale, DL,
-                                                   MVT::i8)));
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
+                        DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
   }
 
   assert(VT.getSizeInBits() == 128 &&
@@ -6671,15 +6717,15 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
   int HiByteShift = Rotation * Scale;
 
   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
-  Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
-  Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+  Lo = DAG.getBitcast(MVT::v2i64, Lo);
+  Hi = DAG.getBitcast(MVT::v2i64, Hi);
 
   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
-  return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+  return DAG.getBitcast(VT,
+                        DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
 }
 
 /// \brief Compute whether each element of a shuffle is zeroable.
@@ -6740,8 +6786,8 @@ static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
                                     IntEltVT);
   if (EltVT.isFloatingPoint()) {
-    Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
-    AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+    Zero = DAG.getBitcast(EltVT, Zero);
+    AllOnes = DAG.getBitcast(EltVT, AllOnes);
   }
   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
@@ -6833,11 +6879,11 @@ static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
            "Illegal integer vector type");
-    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getBitcast(ShiftVT, V);
 
     V = DAG.getNode(OpCode, DL, ShiftVT, V,
                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
-    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+    return DAG.getBitcast(VT, V);
   };
 
   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
@@ -6878,31 +6924,28 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   if (Subtarget->hasSSE41()) {
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+    return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
   }
 
   // For any extends we can cheat for larger element sizes and use shuffle
   // instructions that can fold with a load and/or copy.
   if (AnyExt && EltBits == 32) {
     int PSHUFDMask[4] = {0, -1, 1, -1};
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
-                    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
-                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                        DAG.getBitcast(MVT::v4i32, InputV),
+                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
   }
   if (AnyExt && EltBits == 16 && Scale > 2) {
     int PSHUFDMask[4] = {0, -1, 0, -1};
     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                         DAG.getBitcast(MVT::v4i32, InputV),
                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
     int PSHUFHWMask[4] = {1, -1, -1, -1};
-    return DAG.getNode(
-        ISD::BITCAST, DL, VT,
-        DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
-                    DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
-                    getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
+    return DAG.getBitcast(
+        VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                        DAG.getBitcast(MVT::v8i16, InputV),
+                        getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
   }
 
   // If this would require more than 2 unpack instructions to expand, use
@@ -6914,11 +6957,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     for (int i = 0; i < 16; ++i)
       PSHUFBMask[i] =
           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
-    InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
-                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
-                                               MVT::v16i8, PSHUFBMask)));
+    InputV = DAG.getBitcast(MVT::v16i8, InputV);
+    return DAG.getBitcast(VT,
+                          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                  MVT::v16i8, PSHUFBMask)));
   }
 
   // Otherwise emit a sequence of unpacks.
@@ -6926,13 +6969,13 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                          : getZeroVector(InputVT, Subtarget, DAG, DL);
-    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    InputV = DAG.getBitcast(InputVT, InputV);
     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
     Scale /= 2;
     EltBits *= 2;
     NumElements /= 2;
   } while (Scale > 1);
-  return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
+  return DAG.getBitcast(VT, InputV);
 }
 
 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
@@ -7030,9 +7073,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
   };
 
   if (SDValue V = CanZExtLowHalf()) {
-    V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
+    V = DAG.getBitcast(MVT::v2i64, V);
     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
-    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+    return DAG.getBitcast(VT, V);
   }
 
   // No viable ext lowering found.
@@ -7106,7 +7149,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
   if (SDValue V2S = getScalarValueForVectorElement(
           V2, Mask[V2Index] - Mask.size(), DAG)) {
     // We need to zext the scalar if it is smaller than an i32.
-    V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+    V2S = DAG.getBitcast(EltVT, V2S);
     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
       // Using zext to expand a narrow element won't work for non-zero
       // insertions.
@@ -7155,7 +7198,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
 
   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   if (ExtVT != VT)
-    V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+    V2 = DAG.getBitcast(VT, V2);
 
   if (V2Index != 0) {
     // If we have 4 or fewer lanes we can cheaply shuffle the element into
@@ -7167,13 +7210,13 @@ static SDValue lowerVectorShuffleAsElementInsertion(
       V2Shuffle[V2Index] = 0;
       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
     } else {
-      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+      V2 = DAG.getBitcast(MVT::v2i64, V2);
       V2 = DAG.getNode(
           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
           DAG.getConstant(
               V2Index * EltVT.getSizeInBits()/8, DL,
               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
-      V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+      V2 = DAG.getBitcast(VT, V2);
     }
   }
   return V2;
@@ -7396,13 +7439,13 @@ static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
 
     // Cast the inputs to the type we will use to unpack them.
-    V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
+    V1 = DAG.getBitcast(UnpackVT, V1);
+    V2 = DAG.getBitcast(UnpackVT, V2);
 
     // Unpack the inputs and cast the result back to the desired type.
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
-                                   DL, UnpackVT, V1, V2));
+    return DAG.getBitcast(
+        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                        UnpackVT, V1, V2));
   };
 
   // We try each unpack from the largest to the smallest to try and find one
@@ -7558,12 +7601,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We have to map the mask as it is actually a v4i32 shuffle instruction.
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+    V1 = DAG.getBitcast(MVT::v4i32, V1);
     int WidenedMask[4] = {
         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
-    return DAG.getNode(
-        ISD::BITCAST, DL, MVT::v2i64,
+    return DAG.getBitcast(
+        MVT::v2i64,
         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
   }
@@ -7584,12 +7627,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   };
   if (SDValue V1Pack = GetPackNode(V1))
     if (SDValue V2Pack = GetPackNode(V2))
-      return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
-                         DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
-                                     Mask[0] == 0 ? V1Pack.getOperand(0)
-                                                  : V1Pack.getOperand(1),
-                                     Mask[1] == 2 ? V2Pack.getOperand(0)
-                                                  : V2Pack.getOperand(1)));
+      return DAG.getBitcast(MVT::v2i64,
+                            DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+                                        Mask[0] == 0 ? V1Pack.getOperand(0)
+                                                     : V1Pack.getOperand(1),
+                                        Mask[1] == 2 ? V2Pack.getOperand(0)
+                                                     : V2Pack.getOperand(1)));
 
   // Try to use shift instructions.
   if (SDValue Shift =
@@ -7639,10 +7682,10 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   // However, all the alternatives are still more cycles and newer chips don't
   // have this problem. It would be really nice if x86 had better shuffles here.
-  V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
-  V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
-  return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
-                     DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+  V1 = DAG.getBitcast(MVT::v2f64, V1);
+  V2 = DAG.getBitcast(MVT::v2f64, V2);
+  return DAG.getBitcast(MVT::v2i64,
+                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
@@ -7941,11 +7984,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // up the inputs, bypassing domain shift penalties that we would encur if we
   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
   // relevant.
-  return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
-                     DAG.getVectorShuffle(
-                         MVT::v4f32, DL,
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+  return DAG.getBitcast(
+      MVT::v4i32,
+      DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
+                           DAG.getBitcast(MVT::v4f32, V2), Mask));
 }
 
 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
@@ -8123,11 +8165,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     int PSHUFDMask[] = {0, 1, 2, 3};
     PSHUFDMask[ADWord] = BDWord;
     PSHUFDMask[BDWord] = ADWord;
-    V = DAG.getNode(ISD::BITCAST, DL, VT,
-                    DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
-                                DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
-                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DL,
-                                                           DAG)));
+    V = DAG.getBitcast(
+        VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
 
     // Adjust the mask to match the new locations of A and B.
     for (int &M : Mask)
@@ -8368,11 +8409,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
   if (!isNoopShuffleMask(PSHUFDMask))
-    V = DAG.getNode(ISD::BITCAST, DL, VT,
-                    DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
-                                DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
-                                getV4X86ShuffleImm8ForMask(PSHUFDMask, DL,
-                                                           DAG)));
+    V = DAG.getBitcast(
+        VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
 
   // At this point, each half should contain all its inputs, and we can then
   // just shuffle them into their final position.
@@ -8433,11 +8473,11 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
 
   if (V1InUse)
     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
-                     DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
+                     DAG.getBitcast(MVT::v16i8, V1),
                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
   if (V2InUse)
     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
-                     DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
+                     DAG.getBitcast(MVT::v16i8, V2),
                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
 
   // If we need shuffled inputs from both, blend the two.
@@ -8448,7 +8488,7 @@ static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
     V = V1InUse ? V1 : V2;
 
   // Cast the result back to the correct type.
-  return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  return DAG.getBitcast(VT, V);
 }
 
 /// \brief Generic lowering of 8-lane i16 shuffles.
@@ -8749,10 +8789,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
         // Update the lane map based on the mapping we ended up with.
         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
       }
-      V1 = DAG.getNode(
-          ISD::BITCAST, DL, MVT::v16i8,
-          DAG.getVectorShuffle(MVT::v8i16, DL,
-                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+      V1 = DAG.getBitcast(
+          MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
 
       // Unpack the bytes to form the i16s that will be shuffled into place.
@@ -8770,10 +8809,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                    "Conflicting entrties in the original shuffle!");
         }
-      return DAG.getNode(
-          ISD::BITCAST, DL, MVT::v16i8,
-          DAG.getVectorShuffle(MVT::v8i16, DL,
-                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+      return DAG.getBitcast(
+          MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
     };
     if (SDValue V = tryToWidenViaDuplication())
@@ -8866,19 +8904,18 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     // We use the mask type to pick which bytes are preserved based on how many
     // elements are dropped.
     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
-    SDValue ByteClearMask =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
-                    DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
+    SDValue ByteClearMask = DAG.getBitcast(
+        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
     if (!IsSingleInput)
       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
 
     // Now pack things back together.
-    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
-    V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    V1 = DAG.getBitcast(MVT::v8i16, V1);
+    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
     for (int i = 1; i < NumEvenDrops; ++i) {
-      Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
+      Result = DAG.getBitcast(MVT::v8i16, Result);
       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
     }
 
@@ -8912,7 +8949,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
                    [](int M) { return M >= 0 && M % 2 == 1; })) {
     // Use a mask to drop the high bytes.
-    VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                      DAG.getConstant(0x00FF, DL, MVT::v8i16));
 
@@ -8929,10 +8966,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   } else {
     // Otherwise just unpack the low half of V into VLoHalf and the high half into
     // VHiHalf so that we can blend them as i16s.
-    VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                     DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
-    VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                     DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    VLoHalf = DAG.getBitcast(
+        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+    VHiHalf = DAG.getBitcast(
+        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   }
 
   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
@@ -9073,8 +9110,8 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
       LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
       HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
     }
-    return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
-                          DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+                          DAG.getBitcast(SplitVT, HiV));
   };
 
   SDValue LoV1, HiV1, LoV2, HiV2;
@@ -9407,12 +9444,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
     }
 
-  V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
-  V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+  V1 = DAG.getBitcast(LaneVT, V1);
+  V2 = DAG.getBitcast(LaneVT, V2);
   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
 
   // Cast it back to the type we actually want.
-  LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
 
   // Now do a simple shuffle that isn't lane crossing.
   SmallVector<int, 8> NewMask;
@@ -9441,6 +9478,37 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   return true;
 }
 
+static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+  // Emit SHUFP if Mask fits SHUFPD (possibly commuted); else return SDValue().
+  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
+  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
+  assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
+  int NumElts = VT.getVectorNumElements();
+  bool ShufpdMask = true;
+  bool CommutableMask = true;
+  unsigned Immediate = 0;
+  for (int i = 0; i < NumElts; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    int Val = (i & ~1) + NumElts * (i & 1);
+    int CommutVal = (i & ~1) + NumElts * ((i & 1) ^ 1);
+    if (Mask[i] < Val || Mask[i] > Val + 1)
+      ShufpdMask = false;
+    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
+      CommutableMask = false;
+    Immediate |= (Mask[i] % 2) << i; // Bit i: odd element of the pair.
+  }
+  if (ShufpdMask)
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       DAG.getConstant(Immediate, DL, MVT::i8));
+  if (CommutableMask) // Same immediate, with the two sources swapped.
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+                       DAG.getConstant(Immediate, DL, MVT::i8));
+  return SDValue();
+}
+
 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
 ///
 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -9505,24 +9573,9 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return Blend;
 
   // Check if the blend happens to exactly fit that of SHUFPD.
-  if ((Mask[0] == -1 || Mask[0] < 2) &&
-      (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
-      (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
-      (Mask[3] == -1 || Mask[3] >= 6)) {
-    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
-                          ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
-                       DAG.getConstant(SHUFPDMask, DL, MVT::i8));
-  }
-  if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
-      (Mask[1] == -1 || Mask[1] < 2) &&
-      (Mask[2] == -1 || Mask[2] >= 6) &&
-      (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
-    unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
-                          ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
-                       DAG.getConstant(SHUFPDMask, DL, MVT::i8));
-  }
+  if (SDValue Op =
+      lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+    return Op;
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -9584,10 +9637,10 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
         }
-      return DAG.getNode(
-          ISD::BITCAST, DL, MVT::v4i64,
+      return DAG.getBitcast(
+          MVT::v4i64,
           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
-                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
+                      DAG.getBitcast(MVT::v8i32, V1),
                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
     }
   }
@@ -9700,11 +9753,11 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
 
     if (Subtarget->hasAVX2())
-      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
-                         DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
-                                     DAG.getNode(ISD::BUILD_VECTOR, DL,
+      return DAG.getNode(
+          X86ISD::VPERMV, DL, MVT::v8f32,
+          DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
                                                  MVT::v8i32, VPermMask)),
-                         V1);
+          V1);
 
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -9894,12 +9947,11 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
     }
-    return DAG.getNode(
-        ISD::BITCAST, DL, MVT::v16i16,
-        DAG.getNode(
-            X86ISD::PSHUFB, DL, MVT::v32i8,
-            DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
-            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+    return DAG.getBitcast(MVT::v16i16,
+                          DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
+                                      DAG.getBitcast(MVT::v32i8, V1),
+                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                                  MVT::v32i8, PSHUFBMask)));
   }
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -10039,10 +10091,9 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                 VT.getVectorNumElements());
-    V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
-    return DAG.getNode(ISD::BITCAST, DL, VT,
-                       DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+    V1 = DAG.getBitcast(FpVT, V1);
+    V2 = DAG.getBitcast(FpVT, V2);
+    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
   }
 
   switch (VT.SimpleTy) {
@@ -10064,64 +10115,60 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
-/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
-  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
-
-  // X86 has dedicated unpack instructions that can handle specific blend
-  // operations: UNPCKH and UNPCKL.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
 
-  // FIXME: Implement direct support for this type!
-  return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
+  assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN");
+  // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right)
+  int AlignVal = -1;
+  for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    int Shift = Mask[i] - i; // Every defined lane must share one shift amount.
+    if (Shift < 0 || (AlignVal != -1 && Shift != AlignVal))
+      return SDValue();
+    AlignVal = Shift;
+  }
+  if (AlignVal < 0) // All lanes undef: there is no shift amount to encode.
+    return SDValue();
+  // Vector source operands should be swapped
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
+                     DAG.getConstant(AlignVal, DL, MVT::i8));
 }
 
-/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
-                                       const X86Subtarget *Subtarget,
-                                       SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
-  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+                                           ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, SelectionDAG &DAG) {
 
-  // Use dedicated unpack instructions for masks that match their pattern.
-  if (isShuffleEquivalent(V1, V2, Mask,
-                          {// First 128-bit lane.
-                           0, 16, 1, 17, 4, 20, 5, 21,
-                           // Second 128-bit lane.
-                           8, 24, 9, 25, 12, 28, 13, 29}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask,
-                          {// First 128-bit lane.
-                           2, 18, 3, 19, 6, 22, 7, 23,
-                           // Second 128-bit lane.
-                           10, 26, 11, 27, 14, 30, 15, 31}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+  assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
 
-  // FIXME: Implement direct support for this type!
-  return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
+  // Materialize the shuffle mask as an integer index vector of VT's shape.
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT IdxEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+  MVT IdxVecVT = MVT::getVectorVT(IdxEltVT, NumElts);
+  SmallVector<SDValue, 32> Indices;
+  for (unsigned Lane = 0; Lane != NumElts; ++Lane)
+    Indices.push_back(Mask[Lane] < 0
+                          ? DAG.getUNDEF(IdxEltVT)
+                          : DAG.getConstant(Mask[Lane], DL, IdxEltVT));
+  SDValue IdxNode = DAG.getNode(ISD::BUILD_VECTOR, DL, IdxVecVT, Indices);
+  if (isSingleInputShuffleMask(Mask)) // Single-input mask: VPERMV on V1 only.
+    return DAG.getNode(X86ISD::VPERMV, DL, VT, IdxNode, V1);
+  return DAG.getNode(X86ISD::VPERMV3, DL, VT, IdxNode, V1, V2);
 }
 
-/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+
+/// \brief Handle lowering of 8-lane 64-bit (v8f64 and v8i64) shuffles.
+static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
   SDLoc DL(Op);
-  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+  MVT VT = Op.getSimpleValueType();
+  assert((V1.getSimpleValueType() == MVT::v8f64 ||
+          V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
+  assert((V2.getSimpleValueType() == MVT::v8f64 ||
+          V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
@@ -10129,21 +10176,40 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
   if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
 
-  // FIXME: Implement direct support for this type!
-  return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
+  if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
+    return Op;
+
+  if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
+    return Op;
+
+  // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
+  if (isSingleInputShuffleMask(Mask)) {
+    if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
+                         get1bitLaneShuffleImm8ForMask(Mask, DL, DAG));
+
+    SmallVector<int, 4> RepeatedMask;
+    if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+  }
+  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
   SDLoc DL(Op);
-  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
-  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+  assert((V1.getSimpleValueType() == MVT::v16i32 ||
+          V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
+  assert((V2.getSimpleValueType() == MVT::v16i32 ||
+          V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -10154,16 +10220,39 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                            0, 16, 1, 17, 4, 20, 5, 21,
                            // Second 128-bit lane.
                            8, 24, 9, 25, 12, 28, 13, 29}))
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
   if (isShuffleEquivalent(V1, V2, Mask,
                           {// First 128-bit lane.
                            2, 18, 3, 19, 6, 22, 7, 23,
                            // Second 128-bit lane.
                            10, 26, 11, 27, 14, 30, 15, 31}))
-    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
 
-  // FIXME: Implement direct support for this type!
-  return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
+  if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+                                         12, 12, 14, 14}))
+    return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
+  if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11,
+                                         13, 13, 15, 15}))
+    return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
+
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
+    if (isSingleInputShuffleMask(Mask)) {
+      unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI;
+      return DAG.getNode(Opc, DL, VT, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+    }
+    // Not single-input: rebase entries >= 16 down by 12 (16 -> 4) for SHUFPS.
+    for (int i = 0; i < 4; ++i)
+      if (RepeatedMask[i] >= 16)
+        RepeatedMask[i] -= 12;
+    return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
+  }
+
+  if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
+    return Op;
+
+  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
@@ -10223,13 +10312,11 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
-  case MVT::v16f32:
-    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
-    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+    return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16f32:
   case MVT::v16i32:
-    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+    return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG);
   case MVT::v32i16:
     if (Subtarget->hasBWI())
       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
@@ -10311,10 +10398,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
     // Make sure that the new vector type is legal. For example, v2f64 isn't
     // legal on SSE1.
     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
-      V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
-      V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
-      return DAG.getNode(ISD::BITCAST, dl, VT,
-                         DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+      V1 = DAG.getBitcast(NewVT, V1);
+      V2 = DAG.getBitcast(NewVT, V2);
+      return DAG.getBitcast(
+          VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
     }
   }
 
@@ -10509,12 +10596,11 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
     if (Idx == 0)
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
-                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BITCAST, dl,
-                                                 MVT::v4i32,
-                                                 Op.getOperand(0)),
-                                     Op.getOperand(1)));
+      return DAG.getNode(
+          ISD::TRUNCATE, dl, MVT::i16,
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                      DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+                      Op.getOperand(1)));
     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                   Op.getOperand(0), Op.getOperand(1));
     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
@@ -10538,10 +10624,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
          User->getValueType(0) != MVT::i32))
       return SDValue();
     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
-                                              Op.getOperand(0)),
-                                              Op.getOperand(1));
-    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+                                  Op.getOperand(1));
+    return DAG.getBitcast(MVT::f32, Extract);
   }
 
   if (VT == MVT::i32 || VT == MVT::i64) {
@@ -10655,8 +10740,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     if (Idx == 0)
       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BITCAST, dl,
-                                                 MVT::v4i32, Vec),
+                                     DAG.getBitcast(MVT::v4i32, Vec),
                                      Op.getOperand(1)));
     // Transform it so it match pextrw which produces a 32-bit result.
     MVT EltVT = MVT::i32;
@@ -10877,8 +10961,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
 
   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
   assert(OpVT.is128BitVector() && "Expected an SSE type!");
-  return DAG.getNode(ISD::BITCAST, dl, OpVT,
-                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+  return DAG.getBitcast(
+      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
 }
 
 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
@@ -11670,14 +11754,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
-  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
-                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
-                              CLod0);
+  SDValue Unpck1 =
+      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
 
   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
-  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   SDValue Result;
 
@@ -11685,12 +11768,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   } else {
-    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
                                            S2F, 0x4E, DAG);
     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
-                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
-                         Sub);
+                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
   }
 
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
@@ -11713,20 +11795,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
 
   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+                     DAG.getBitcast(MVT::v2f64, Load),
                      DAG.getIntPtrConstant(0, dl));
 
   // Or the load with the bias.
-  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   MVT::v2f64, Load)),
-                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                                   MVT::v2f64, Bias)));
-  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
-                   DAG.getIntPtrConstant(0, dl));
+  SDValue Or = DAG.getNode(
+      ISD::OR, dl, MVT::v2i64,
+      DAG.getBitcast(MVT::v2i64,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+      DAG.getBitcast(MVT::v2i64,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+  Or =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
 
   // Subtract the bias.
   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
@@ -11805,19 +11886,16 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
   if (Subtarget.hasSSE41()) {
     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
-    SDValue VecCstLowBitcast =
-        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
-    SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
+    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
     // Low will be bitcasted right away, so do not bother bitcasting back to its
     // original type.
     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
     //                                 (uint4) 0x53000000, 0xaa);
-    SDValue VecCstHighBitcast =
-        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
-    SDValue VecShiftBitcast =
-        DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
+    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
     // High will be bitcasted right away, so do not bother bitcasting back to
     // its original type.
     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
@@ -11843,11 +11921,11 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                    makeArrayRef(&CstFAddArray[0], NumElts));
 
   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
-  SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
+  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
   SDValue FHigh =
       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
   //     return (float4) lo + fhi;
-  SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
+  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
 }
 
@@ -12103,8 +12181,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                              VT.getVectorNumElements()/2);
 
-  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
-  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+  OpLo = DAG.getBitcast(HVT, OpLo);
+  OpHi = DAG.getBitcast(HVT, OpHi);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
@@ -12189,14 +12267,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
         Subtarget->hasBWI())
       return Op; // legal, will go to VPMOVB2M, VPMOVW2M
-    if ((InVT.is256BitVector() || InVT.is128BitVector()) 
+    if ((InVT.is256BitVector() || InVT.is128BitVector())
         && InVT.getScalarSizeInBits() <= 16 &&
         Subtarget->hasBWI() && Subtarget->hasVLX())
       return Op; // legal, will go to VPMOVB2M, VPMOVW2M
     if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
         Subtarget->hasDQI())
       return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
-    if ((InVT.is256BitVector() || InVT.is128BitVector()) 
+    if ((InVT.is256BitVector() || InVT.is128BitVector())
         && InVT.getScalarSizeInBits() >= 32 &&
         Subtarget->hasDQI() && Subtarget->hasVLX())
       return Op; // legal, will go to VPMOVB2M, VPMOVQ2M
@@ -12224,7 +12302,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
     if (Subtarget->hasInt256()) {
       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+      In = DAG.getBitcast(MVT::v8i32, In);
       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
                                 ShufMask);
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
@@ -12235,8 +12313,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
                                DAG.getIntPtrConstant(0, DL));
     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                                DAG.getIntPtrConstant(2, DL));
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
     static const int ShufMask[] = {0, 2, 4, 6};
     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
   }
@@ -12244,7 +12322,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
     if (Subtarget->hasInt256()) {
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+      In = DAG.getBitcast(MVT::v32i8, In);
 
       SmallVector<SDValue,32> pshufbMask;
       for (unsigned i = 0; i < 2; ++i) {
@@ -12261,14 +12339,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
       }
       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
-      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+      In = DAG.getBitcast(MVT::v4i64, In);
 
       static const int ShufMask[] = {0,  2,  -1,  -1};
       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
                                 &ShufMask[0]);
       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                        DAG.getIntPtrConstant(0, DL));
-      return DAG.getNode(ISD::BITCAST, DL, VT, In);
+      return DAG.getBitcast(VT, In);
     }
 
     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
@@ -12277,8 +12355,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                                DAG.getIntPtrConstant(4, DL));
 
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
+    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
 
     // The PSHUFB mask:
     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
@@ -12288,13 +12366,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
 
-    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
 
     // The MOVLHPS Mask:
     static const int ShufMask2[] = {0, 1, 4, 5};
     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
-    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+    return DAG.getBitcast(MVT::v8i16, res);
   }
 
   // Handle truncation of V256 to V128 using shuffles.
@@ -12310,8 +12388,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   // Prepare truncation shuffle mask
   for (unsigned i = 0; i != NumElems; ++i)
     MaskVec[i] = i * 2;
-  SDValue V = DAG.getVectorShuffle(NVT, DL,
-                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+  SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
                                    DAG.getUNDEF(NVT), &MaskVec[0]);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                      DAG.getIntPtrConstant(0, DL));
@@ -12420,13 +12497,12 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
     // For a vector, cast operands to a vector type, perform the logic op,
     // and cast the result back to the original value type.
     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
-    SDValue Operand = IsFNABS ?
-      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
-      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+    SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
+    SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
+                              : DAG.getBitcast(VecVT, Op0);
     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
+    return DAG.getBitcast(VT,
+                          DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
   }
 
   // If not vector, then scalar.
@@ -12591,7 +12667,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
 
   // Cast all vectors into TestVT for PTEST.
   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
-    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
 
   // If more than one full vectors are evaluated, OR them first before PTEST.
   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
@@ -12925,29 +13001,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps,
                                             bool &UseOneConstNR) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor and/or sqrt operand.
-  if (!Subtarget->useSqrtEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
+  const char *RecipOp;
 
-  // SSE1 has rsqrtss and rsqrtps.
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
   // instructions: convert to single, rsqrtss, convert back to double, refine
   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = 1;
-    UseOneConstNR = false;
-    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "sqrtf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-sqrtf";
+  else
+    return SDValue();
+  
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+  
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
 }
 
 /// The minimum architected relative accuracy is 2^-12. We need one
@@ -12955,15 +13033,9 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
                                             DAGCombinerInfo &DCI,
                                             unsigned &RefinementSteps) const {
-  // FIXME: We should use instruction latency models to calculate the cost of
-  // each potential sequence, but this is very hard to do reliably because
-  // at least Intel's Core* chips have variable timing based on the number of
-  // significant digits in the divisor.
-  if (!Subtarget->useReciprocalEst())
-    return SDValue();
-
   EVT VT = Op.getValueType();
-
+  const char *RecipOp;
+  
   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
   // TODO: Add support for AVX512 (v16f32).
   // It is likely not profitable to do this for f64 because a double-precision
@@ -12971,12 +13043,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
   // 15 instructions: convert to single, rcpss, convert back to double, refine
   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
   // along with FMA, this could be a throughput win.
-  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
-      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
-    RefinementSteps = ReciprocalEstimateRefinementSteps;
-    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
-  }
-  return SDValue();
+  if (VT == MVT::f32 && Subtarget->hasSSE1())
+    RecipOp = "divf";
+  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::v8f32 && Subtarget->hasAVX()))
+    RecipOp = "vec-divf";
+  else
+    return SDValue();
+  
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  RefinementSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
@@ -13407,8 +13487,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
 
       // First cast everything to the right type.
-      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
 
       // Since SSE has no unsigned integer comparisons, we need to flip the sign
       // bits of the inputs before performing those operations. The lower
@@ -13442,7 +13522,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       if (Invert)
         Result = DAG.getNOT(dl, Result, MVT::v4i32);
 
-      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      return DAG.getBitcast(VT, Result);
     }
 
     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
@@ -13451,8 +13531,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
 
       // First cast everything to the right type.
-      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
-      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
 
       // Do the compare.
       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
@@ -13465,7 +13545,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       if (Invert)
         Result = DAG.getNOT(dl, Result, MVT::v4i32);
 
-      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      return DAG.getBitcast(VT, Result);
     }
   }
 
@@ -13662,7 +13742,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
 
         EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
-        VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
+        VCmp = DAG.getBitcast(VCmpVT, VCmp);
 
         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
 
@@ -13687,12 +13767,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
         Op2Scalar = Op2.getOperand(0);
       if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
-        SDValue newSelect = DAG.getNode(ISD::SELECT, DL, 
+        SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
                                         Op1Scalar.getValueType(),
                                         Cond, Op1Scalar, Op2Scalar);
         if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
-          return DAG.getNode(ISD::BITCAST, DL, VT, newSelect);
-        SDValue ExtVec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i1, newSelect);
+          return DAG.getBitcast(VT, newSelect);
+        SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                            DAG.getIntPtrConstant(0, DL));
     }
@@ -13975,7 +14055,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
-    Curr = DAG.getNode(ISD::BITCAST, dl, CurrVT, Curr);
+    Curr = DAG.getBitcast(CurrVT, Curr);
   }
 
   SDValue SignExt = Curr;
@@ -13993,7 +14073,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                                DAG.getConstant(31, dl, MVT::i8));
     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
-    return DAG.getNode(ISD::BITCAST, dl, VT, Ext);
+    return DAG.getBitcast(VT, Ext);
   }
 
   return SDValue();
@@ -14202,7 +14282,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
 
   // Bitcast the loaded value to a vector of the original element type, in
   // the size of the target vector type.
-  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
   unsigned SizeRatio = RegSz / MemSz;
 
   if (Ext == ISD::SEXTLOAD) {
@@ -14227,7 +14307,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
     SDValue Shuff = DAG.getVectorShuffle(
         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
 
-    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+    Shuff = DAG.getBitcast(RegVT, Shuff);
 
     // Build the arithmetic shift.
     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
@@ -14249,7 +14329,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
 
   // Bitcast to the requested type.
-  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+  Shuff = DAG.getBitcast(RegVT, Shuff);
   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
   return Shuff;
 }
@@ -14933,7 +15013,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
   MVT EltVT = VT.getVectorElementType();
   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
 
-  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
+  ShAmt = DAG.getBitcast(ShVT, ShAmt);
   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
 }
 
@@ -14959,8 +15039,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
     // are extracted by EXTRACT_SUBVECTOR.
     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                              DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
-                              DAG.getIntPtrConstant(0, dl));
+                                DAG.getBitcast(BitcastVT, Mask),
+                                DAG.getIntPtrConstant(0, dl));
 
     switch (Op.getOpcode()) {
       default: break;
@@ -15017,12 +15097,31 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
         Op.getOperand(2), Op.getOperand(3));
     case INTR_TYPE_1OP_MASK_RM: {
       SDValue Src = Op.getOperand(1);
-      SDValue Src0 = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(2);
       SDValue Mask = Op.getOperand(3);
-      SDValue RoundingMode = Op.getOperand(4);
+      SDValue RoundingMode;
+      if (Op.getNumOperands() == 4)
+        RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+      else
+        RoundingMode = Op.getOperand(4);
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) 
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(), Src, RoundingMode),
+                                      Mask, PassThru, Subtarget, DAG);
+      }
       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                               RoundingMode),
-                                  Mask, Src0, Subtarget, DAG);
+                                  Mask, PassThru, Subtarget, DAG);
+    }
+    case INTR_TYPE_1OP_MASK: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Passthru = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+                                  Mask, Passthru, Subtarget, DAG);
     }
     case INTR_TYPE_SCALAR_MASK_RM: {
       SDValue Src1 = Op.getOperand(1);
@@ -15069,6 +15168,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                               Src1,Src2),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case INTR_TYPE_3OP_MASK: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+      SDValue PassThru = Op.getOperand(4);
+      SDValue Mask = Op.getOperand(5);
+      // Intrinsics that accept a rounding mode specify two opcodes. When a
+      // rounding opcode exists (IntrData->Opc1 != 0), check the rounding mode
+      // operand and use that opcode unless the mode is CUR_DIRECTION.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(6);
+        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                      dl, Op.getValueType(),
+                                      Src1, Src2, Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+        }
+      }
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+                                              Src1, Src2, Src3),
+                                  Mask, PassThru, Subtarget, DAG);
+    }
     case FMA_OP_MASK: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
@@ -15140,7 +15263,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                 DAG.getUNDEF(BitcastVT), CmpMask,
                                 DAG.getIntPtrConstant(0, dl));
-      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+      return DAG.getBitcast(Op.getValueType(), Res);
     }
     case COMI: { // Comparison intrinsics
       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -15176,7 +15299,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                        Mask.getValueType().getSizeInBits());
       SDLoc dl(Op);
       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                  DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                  DAG.getBitcast(BitcastVT, Mask),
                                   DAG.getIntPtrConstant(0, dl));
 
       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
@@ -15191,7 +15314,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                        Mask.getValueType().getSizeInBits());
       SDLoc dl(Op);
       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                  DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                  DAG.getBitcast(BitcastVT, Mask),
                                   DAG.getIntPtrConstant(0, dl));
       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
                          Op.getOperand(2));
@@ -15211,16 +15334,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(1));
 
-  case Intrinsic::x86_avx512_mask_valign_q_512:
-  case Intrinsic::x86_avx512_mask_valign_d_512:
-    // Vector source operands are swapped.
-    return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
-                                            Op.getValueType(), Op.getOperand(2),
-                                            Op.getOperand(1),
-                                            Op.getOperand(3)),
-                                Op.getOperand(5), Op.getOperand(4),
-                                Subtarget, DAG);
-
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
   // or testp pattern and a setcc for the result.
@@ -15289,8 +15402,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
   case Intrinsic::x86_avx512_kortestz_w:
   case Intrinsic::x86_avx512_kortestc_w: {
     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
-    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
-    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
@@ -15378,7 +15491,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
     // Compute the symbol for the LSDA. We know it'll get emitted later.
     MachineFunction &MF = DAG.getMachineFunction();
     SDValue Op1 = Op.getOperand(1);
-    Op1->dump();
     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
         GlobalValue::getRealLinkageName(Fn->getName()));
@@ -15409,7 +15521,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   if (MaskC)
     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
   else
-    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+    MaskInReg = DAG.getBitcast(MaskVT, Mask);
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -15437,7 +15549,7 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   if (MaskC)
     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
   else
-    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+    MaskInReg = DAG.getBitcast(MaskVT, Mask);
   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -15460,7 +15572,7 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   if (MaskC)
     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
   else
-    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+    MaskInReg = DAG.getBitcast(MaskVT, Mask);
   //SDVTList VTs = DAG.getVTList(MVT::Other);
   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
@@ -15693,23 +15805,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
     SDValue Addr = Op.getOperand(2);
     SDValue Chain = Op.getOperand(0);
 
+    EVT VT = DataToCompress.getValueType();
     if (isAllOnes(Mask)) // return just a store
       return DAG.getStore(Chain, dl, DataToCompress, Addr,
-                          MachinePointerInfo(), false, false, 0);
+                          MachinePointerInfo(), false, false,
+                          VT.getScalarSizeInBits()/8);
 
-    EVT VT = DataToCompress.getValueType();
     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                   VT.getVectorNumElements());
     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                      Mask.getValueType().getSizeInBits());
     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                DAG.getBitcast(BitcastVT, Mask),
                                 DAG.getIntPtrConstant(0, dl));
 
     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
                                       DataToCompress, DAG.getUNDEF(VT));
     return DAG.getStore(Chain, dl, Compressed, Addr,
-                        MachinePointerInfo(), false, false, 0);
+                        MachinePointerInfo(), false, false,
+                        VT.getScalarSizeInBits()/8);
   }
   case EXPAND_FROM_MEM: {
     SDLoc dl(Op);
@@ -15721,17 +15835,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
 
     if (isAllOnes(Mask)) // return just a load
       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
-                         false, 0);
+                         false, VT.getScalarSizeInBits()/8);
     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                   VT.getVectorNumElements());
     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                      Mask.getValueType().getSizeInBits());
     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
-                                DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                                DAG.getBitcast(BitcastVT, Mask),
                                 DAG.getIntPtrConstant(0, dl));
 
     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
-                                   false, false, false, 0);
+                                       false, false, false,
+                                       VT.getScalarSizeInBits()/8);
 
     SDValue Results[] = {
         DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru),
@@ -16274,8 +16389,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                               -1, 4, -1, 5, -1, 6, -1, 7};
       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-      ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo);
-      BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo);
+      ALo = DAG.getBitcast(ExVT, ALo);
+      BLo = DAG.getBitcast(ExVT, BLo);
       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
     }
@@ -16294,8 +16409,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                               -1, 12, -1, 13, -1, 14, -1, 15};
       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-      AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi);
-      BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi);
+      AHi = DAG.getBitcast(ExVT, AHi);
+      BHi = DAG.getBitcast(ExVT, BHi);
       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
     }
@@ -16323,8 +16438,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
     // Now multiply odd parts.
     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
 
-    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
-    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+    Evens = DAG.getBitcast(VT, Evens);
+    Odds = DAG.getBitcast(VT, Odds);
 
     // Merge the two vectors back together with a shuffle. This expands into 2
     // shuffles.
@@ -16352,10 +16467,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   // Bit cast to 32-bit vectors for MULUDQ
   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
-  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
-  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
-  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
-  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
+  A = DAG.getBitcast(MulVT, A);
+  B = DAG.getBitcast(MulVT, B);
+  Ahi = DAG.getBitcast(MulVT, Ahi);
+  Bhi = DAG.getBitcast(MulVT, Bhi);
 
   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
@@ -16417,7 +16532,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
 
   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
-  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+  return DAG.getBitcast(VT, CallInfo.first);
 }
 
 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
@@ -16455,12 +16570,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
   // => <2 x i64> <ae|cg>
-  SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
   // => <2 x i64> <bf|dh>
-  SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
 
   // Shuffle it back into the right order.
   SDValue Highs, Lows;
@@ -16499,16 +16612,16 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
 
 // Return true if the requred (according to Opcode) shift-imm form is natively
 // supported by the Subtarget
-static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, 
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
                                         unsigned Opcode) {
   if (VT.getScalarSizeInBits() < 16)
     return false;
- 
+
   if (VT.is512BitVector() &&
       (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
     return true;
 
-  bool LShift = VT.is128BitVector() || 
+  bool LShift = VT.is128BitVector() ||
     (VT.is256BitVector() && Subtarget->hasInt256());
 
   bool AShift = LShift && (Subtarget->hasVLX() ||
@@ -16518,15 +16631,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
 
 // The shift amount is a variable, but it is the same for all vector lanes.
 // These instrcutions are defined together with shift-immediate.
-static 
-bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, 
+static
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
                                       unsigned Opcode) {
   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
 }
 
 // Return true if the requred (according to Opcode) variable-shift form is
 // natively supported by the Subtarget
-static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, 
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
                                     unsigned Opcode) {
 
   if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
@@ -16574,7 +16687,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
           // Make a large shift.
           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                    R, ShiftAmt, DAG);
-          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+          SHL = DAG.getBitcast(VT, SHL);
           // Zero out the rightmost bits.
           SmallVector<SDValue, 32> V(
               NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
@@ -16585,7 +16698,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
           // Make a large shift.
           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                    R, ShiftAmt, DAG);
-          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+          SRL = DAG.getBitcast(VT, SRL);
           // Zero out the leftmost bits.
           SmallVector<SDValue, 32> V(
               NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
@@ -16801,7 +16914,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
 
     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                      DAG.getConstant(0x3f800000U, dl, VT));
-    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
+    Op = DAG.getBitcast(MVT::v4f32, Op);
     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   }
@@ -16871,11 +16984,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
       if (TargetOpcode == X86ISD::MOVSD)
         CastVT = MVT::v2i64;
-      SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
-      SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
+      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
                                             BitCast1, DAG);
-      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      return DAG.getBitcast(VT, Result);
     }
   }
 
@@ -16931,10 +17044,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
-    ALo = DAG.getNode(ISD::BITCAST, dl, ExtVT, ALo);
-    AHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, AHi);
-    RLo = DAG.getNode(ISD::BITCAST, dl, ExtVT, RLo);
-    RHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, RHi);
+    ALo = DAG.getBitcast(ExtVT, ALo);
+    AHi = DAG.getBitcast(ExtVT, AHi);
+    RLo = DAG.getBitcast(ExtVT, RLo);
+    RHi = DAG.getBitcast(ExtVT, RHi);
     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
@@ -17293,7 +17406,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
 
     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
-    SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                        DAG.getIntPtrConstant(0, dl));
   }
@@ -17315,141 +17428,241 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   return SDValue();
 }
 
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
-                          SelectionDAG &DAG) {
-  SDNode *Node = Op.getNode();
-  SDLoc dl(Node);
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+                                      const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc DL(V);
+  MVT ByteVecVT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  int NumElts = VT.getVectorNumElements();
+  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+         "Expected value to have byte element type.");
+  assert(EltVT != MVT::i8 &&
+         "Horizontal byte sum only makes sense for wider elements!");
+  unsigned VecSize = VT.getSizeInBits();
+  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+  // The PSADBW instruction horizontally adds all bytes and leaves the result in
+  // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
+  if (EltVT == MVT::i64) {
+    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
+    return DAG.getBitcast(VT, V);
+  }
+
+  if (EltVT == MVT::i32) {
+    // We unpack the low half and high half into i32s interleaved with zeros so
+    // that we can use PSADBW to horizontally sum them. The most useful part of
+    // this is that it lines up the results of two PSADBW instructions to be
+    // two v2i64 vectors which concatenated are the 4 population counts. We can
+    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
+    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
+
+    // Do the horizontal sums into two v2i64s.
+    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                      DAG.getBitcast(ByteVecVT, Low), Zeros);
+    High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                       DAG.getBitcast(ByteVecVT, High), Zeros);
+
+    // Merge them together.
+    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+                    DAG.getBitcast(ShortVecVT, Low),
+                    DAG.getBitcast(ShortVecVT, High));
+
+    return DAG.getBitcast(VT, V);
+  }
+
+  // The only element type left is i16.
+  assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+  // To obtain pop count for each i16 element starting from the pop count for
+  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+  // right by 8. It is important to shift as i16s as i8 vector shift isn't
+  // directly supported.
+  SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
+  SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+                  DAG.getBitcast(ByteVecVT, V));
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+}
+
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned VecSize = VT.getSizeInBits();
 
-  Op = Op.getOperand(0);
-  EVT VT = Op.getValueType();
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "CTPOP lowering only implemented for 128/256-bit wide vector types");
+  // Implement a lookup table in register by using an algorithm based on:
+  // http://wm.ite.pl/articles/sse-popcount.html
+  //
+  // The general idea is that every lower byte nibble in the input vector is an
+  // index into an in-register pre-computed pop count table. We then split up
+  // the input vector in two new ones: (1) a vector with only the shifted-right
+  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
+  // masked out higher ones) for each byte. PSHUFB is used separately with both
+  // to index the in-register table. Next, both are added and the result is an
+  // i8 vector where each element contains the pop count for the input byte.
+  //
+  // To obtain the pop count for elements != i8, we follow up with the same
+  // approach and use additional tricks as described below.
+  //
+  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+  int NumByteElts = VecSize / 8;
+  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+  SDValue In = DAG.getBitcast(ByteVecVT, Op);
+  SmallVector<SDValue, 16> LUTVec;
+  for (int i = 0; i < NumByteElts; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
+  SmallVector<SDValue, 16> Mask0F(NumByteElts,
+                                  DAG.getConstant(0x0F, DL, MVT::i8));
+  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+
+  // High nibbles
+  SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
+  SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+  // Low nibbles
+  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+  // The input vector is used as the shuffle mask that indexes elements into the
+  // LUT. After counting low and high nibbles, add the vector to obtain the
+  // final pop count per i8 element.
+  SDValue HighPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+  SDValue LowPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
 
-  unsigned NumElts = VT.getVectorNumElements();
-  EVT EltVT = VT.getVectorElementType();
-  unsigned Len = EltVT.getSizeInBits();
+  if (EltVT == MVT::i8)
+    return PopCnt;
+
+  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  assert(VT.is128BitVector() &&
+         "Only 128-bit vector bitmath lowering supported.");
+
+  int VecSize = VT.getSizeInBits();
+  MVT EltVT = VT.getVectorElementType();
+  int Len = EltVT.getSizeInBits();
 
   // This is the vectorized version of the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   // with a minor tweak to use a series of adds + shifts instead of vector
-  // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
-  //
-  //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
-  //  v8i32 => Always profitable
-  //
-  // FIXME: There a couple of possible improvements:
-  //
-  // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
-  // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
-  //
-  assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
-         "CTPOP not implemented for this vector element type.");
+  // multiplications. Implemented for all integer vector types. We only use
+  // this when we don't have SSSE3 which allows a LUT-based lowering that is
+  // much faster, even faster than using native popcnt instructions.
+
+  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
+    MVT VT = V.getSimpleValueType();
+    SmallVector<SDValue, 32> Shifters(
+        VT.getVectorNumElements(),
+        DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
+    return DAG.getNode(OpCode, DL, VT, V,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
+  };
+  auto GetMask = [&](SDValue V, APInt Mask) {
+    MVT VT = V.getSimpleValueType();
+    SmallVector<SDValue, 32> Masks(
+        VT.getVectorNumElements(),
+        DAG.getConstant(Mask, DL, VT.getVectorElementType()));
+    return DAG.getNode(ISD::AND, DL, VT, V,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
+  };
 
-  // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
-  // extra legalization.
-  bool NeedsBitcast = EltVT == MVT::i32;
-  MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
+  // x86, so set the SRL type to have elements at least i16 wide. This is
+  // correct because all of our SRLs are followed immediately by a mask anyways
+  // that handles any bits that sneak into the high bits of the byte elements.
+  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
 
-  SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl,
-                                  EltVT);
-  SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl,
-                                  EltVT);
-  SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl,
-                                  EltVT);
+  SDValue V = Op;
 
   // v = v - ((v >> 1) & 0x55555555...)
-  SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, dl, EltVT));
-  SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
-  SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
-  if (NeedsBitcast)
-    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
-
-  SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
-  SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
-  if (NeedsBitcast)
-    M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
-
-  SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
-  if (VT != And.getValueType())
-    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
-  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+  SDValue Srl =
+      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
+  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
 
   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
-  SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
-  SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, dl, EltVT));
-  SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
+  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
 
-  Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
-  if (NeedsBitcast) {
-    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
-    M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
-    Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
-  }
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
+  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
 
-  SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
-  SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
-  if (VT != AndRHS.getValueType()) {
-    AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
-    AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
-  }
-  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+  // At this point, V contains the byte-wise population count, and we are
+  // merely doing a horizontal sum if necessary to get the wider element
+  // counts.
+  if (EltVT == MVT::i8)
+    return V;
 
-  // v = (v + (v >> 4)) & 0x0F0F0F0F...
-  SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, dl, EltVT));
-  SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
-  Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
-  Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
-
-  SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
-  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
-  if (NeedsBitcast) {
-    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
-    M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
-  }
-  And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
-  if (VT != And.getValueType())
-    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
-
-  // The algorithm mentioned above uses:
-  //    v = (v * 0x01010101...) >> (Len - 8)
-  //
-  // Change it to use vector adds + vector shifts which yield faster results on
-  // Haswell than using vector integer multiplication.
-  //
-  // For i32 elements:
-  //    v = v + (v >> 8)
-  //    v = v + (v >> 16)
-  //
-  // For i64 elements:
-  //    v = v + (v >> 8)
-  //    v = v + (v >> 16)
-  //    v = v + (v >> 32)
-  //
-  Add = And;
-  SmallVector<SDValue, 8> Csts;
-  for (unsigned i = 8; i <= Len/2; i *= 2) {
-    Csts.assign(NumElts, DAG.getConstant(i, dl, EltVT));
-    SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
-    Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
-    Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
-    Csts.clear();
+  return LowerHorizontalByteSum(
+      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+      DAG);
+}
+
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  // FIXME: Need to add AVX-512 support here!
+  assert((VT.is256BitVector() || VT.is128BitVector()) &&
+         "Unknown CTPOP type to handle");
+  SDLoc DL(Op.getNode());
+  SDValue Op0 = Op.getOperand(0);
+
+  if (!Subtarget->hasSSSE3()) {
+    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
+    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
   }
 
-  // The result is on the least significant 6-bits on i32 and 7-bits on i64.
-  SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), dl,
-                                  EltVT);
-  SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
-  SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
-  if (NeedsBitcast) {
-    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
-    M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+  if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+    unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, compute pop count and concat the result.
+    SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
+    SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
   }
-  And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
-  if (VT != And.getValueType())
-    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
 
-  return And;
+  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
+}
+
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
+  assert(Op.getValueType().isVector() &&
+         "We only do custom lowering for vector population count.");
+  return LowerVectorCTPOP(Op, Subtarget, DAG);
 }
 
 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
@@ -17840,8 +18053,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                      MVT::f64);
     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
-                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
-    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+                             DAG.getBitcast(MVT::v2i64, VBias));
+    Or = DAG.getBitcast(MVT::v2f64, Or);
     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
     return;
@@ -17964,7 +18177,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                    MVT::v2f64, N->getOperand(0));
-    SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
 
     if (ExperimentalVectorWideningLegalization) {
       // If we are legalizing vectors by widening, we already have the desired
@@ -17994,7 +18207,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FANDN:              return "X86ISD::FANDN";
   case X86ISD::FOR:                return "X86ISD::FOR";
   case X86ISD::FXOR:               return "X86ISD::FXOR";
-  case X86ISD::FSRL:               return "X86ISD::FSRL";
   case X86ISD::FILD:               return "X86ISD::FILD";
   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
@@ -18121,6 +18333,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
+  case X86ISD::SHUF128:            return "X86ISD::SHUF128";
   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
@@ -18143,8 +18356,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
+  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
+  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -18184,6 +18400,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
+  case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
+  case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
   case X86ISD::ADDS:               return "X86ISD::ADDS";
   case X86ISD::SUBS:               return "X86ISD::SUBS";
   }
@@ -18193,7 +18411,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
 // isLegalAddressingMode - Return true if the addressing mode represented
 // by AM is legal for this target, for a load/store of the specified type.
 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              Type *Ty) const {
+                                              Type *Ty,
+                                              unsigned AS) const {
   // X86 supports extremely general addressing modes.
   CodeModel::Model M = getTargetMachine().getCodeModel();
   Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -20028,7 +20247,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                  SDValue(ResNode.getNode(), 1));
         }
 
-        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+        return DAG.getBitcast(VT, ResNode);
       }
     }
 
@@ -20087,7 +20306,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
 
   // Just remove no-op shuffle masks.
   if (Mask.size() == 1) {
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
                   /*AddTo*/ true);
     return true;
   }
@@ -20123,14 +20342,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
       }
       if (Depth == 1 && Root->getOpcode() == Shuffle)
         return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      Op = DAG.getBitcast(ShuffleVT, Input);
       DCI.AddToWorklist(Op.getNode());
       if (Shuffle == X86ISD::MOVDDUP)
         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
       else
         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
       DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                     /*AddTo*/ true);
       return true;
     }
@@ -20141,11 +20360,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
       MVT ShuffleVT = MVT::v4f32;
       if (Depth == 1 && Root->getOpcode() == Shuffle)
         return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      Op = DAG.getBitcast(ShuffleVT, Input);
       DCI.AddToWorklist(Op.getNode());
       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
       DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                     /*AddTo*/ true);
       return true;
     }
@@ -20155,11 +20374,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
       MVT ShuffleVT = MVT::v4f32;
       if (Depth == 1 && Root->getOpcode() == Shuffle)
         return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      Op = DAG.getBitcast(ShuffleVT, Input);
       DCI.AddToWorklist(Op.getNode());
       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
       DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                     /*AddTo*/ true);
       return true;
     }
@@ -20189,11 +20408,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
     default:
       llvm_unreachable("Impossible mask size!");
     };
-    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    Op = DAG.getBitcast(ShuffleVT, Input);
     DCI.AddToWorklist(Op.getNode());
     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
     DCI.AddToWorklist(Op.getNode());
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                   /*AddTo*/ true);
     return true;
   }
@@ -20222,14 +20441,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
     }
     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
-    Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
+    Op = DAG.getBitcast(ByteVT, Input);
     DCI.AddToWorklist(Op.getNode());
     SDValue PSHUFBMaskOp =
         DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
     Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
     DCI.AddToWorklist(Op.getNode());
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
                   /*AddTo*/ true);
     return true;
   }
@@ -20401,7 +20620,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
 #ifndef NDEBUG
     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
       for (int j = 0; j < LaneElts; ++j)
-        assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts &&
+        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
                "Mask doesn't repeat in high 128-bit lanes!");
 #endif
     Mask.resize(LaneElts);
@@ -20532,7 +20751,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
     SDValue W = Chain.pop_back_val();
 
     if (V.getValueType() != W.getOperand(0).getValueType())
-      V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
+      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
 
     switch (W.getOpcode()) {
     default:
@@ -20551,7 +20770,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
     }
   }
   if (V.getValueType() != N.getValueType())
-    V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
+    V = DAG.getBitcast(N.getValueType(), V);
 
   // Return the new chain to replace N.
   return V;
@@ -20668,12 +20887,12 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
       DMask[DOffset + 0] = DOffset + 1;
       DMask[DOffset + 1] = DOffset + 0;
       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
-      V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
+      V = DAG.getBitcast(DVT, V);
       DCI.AddToWorklist(V.getNode());
       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
       DCI.AddToWorklist(V.getNode());
-      return DAG.getNode(ISD::BITCAST, DL, VT, V);
+      return DAG.getBitcast(VT, V);
     }
 
     // Look for shuffle patterns which can be implemented as a single unpack.
@@ -20704,7 +20923,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
           // We can replace all three shuffles with an unpack.
-          V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
+          V = DAG.getBitcast(VT, D.getOperand(0));
           DCI.AddToWorklist(V.getNode());
           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                 : X86ISD::UNPCKH,
@@ -20848,8 +21067,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
         CanFold = SVOp->getMaskElt(i) < 0;
 
       if (CanFold) {
-        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
-        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
       }
@@ -20981,7 +21200,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
                                  InVec.getOperand(0), Shuffle,
                                  &ShuffleMask[0]);
-  Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
+  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                      EltNo);
 }
@@ -21101,7 +21320,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   SDValue Vals[4];
 
   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
-    SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
       DAG.getConstant(0, dl, VecIdxTy));
@@ -21717,13 +21936,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       if (TValIsAllOnes && FValIsAllZeros)
         Ret = Cond;
       else if (TValIsAllOnes)
-        Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
+        Ret =
+            DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
       else if (FValIsAllZeros)
         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
-                          DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
+                          DAG.getBitcast(CondVT, LHS));
 
-      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+      return DAG.getBitcast(VT, Ret);
     }
   }
 
@@ -22554,15 +22773,13 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
             // and work with those going forward.
             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                            OnesOrZeroesF);
-            SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
-                                           Vector64);
+            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                         Vector32, DAG.getIntPtrConstant(0, DL));
             IntVT = MVT::i32;
           }
 
-          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT,
-                                              OnesOrZeroesF);
+          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                       DAG.getConstant(1, DL, IntVT));
           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
@@ -22775,7 +22992,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
 
   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
     Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
-  return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle);
+  return DAG.getBitcast(N0.getValueType(), NewShuffle);
 }
 
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
@@ -22916,7 +23133,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
                "Unsupported VT for PSIGN");
         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+        return DAG.getBitcast(VT, Mask);
       }
       // PBLENDVB only available on SSE 4.1
       if (!Subtarget->hasSSE41())
@@ -22924,11 +23141,11 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
 
       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
 
-      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
-      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
-      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+      X = DAG.getBitcast(BlendVT, X);
+      Y = DAG.getBitcast(BlendVT, Y);
+      Mask = DAG.getBitcast(BlendVT, Mask);
       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
-      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+      return DAG.getBitcast(VT, Mask);
     }
   }
 
@@ -23129,7 +23346,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 
   // Convert Src0 value
-  SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
@@ -23146,7 +23363,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
   SDValue Mask = Mld->getMask();
   if (Mask.getValueType() == VT) {
     // Mask and original value have the same type
-    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
       ShuffleVec[i] = i * SizeRatio;
@@ -23214,7 +23431,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
 
   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 
-  SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
   for (unsigned i = 0; i != NumElems; ++i)
     ShuffleVec[i] = i * SizeRatio;
@@ -23231,7 +23448,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
   SDValue Mask = Mst->getMask();
   if (Mask.getValueType() == VT) {
     // Mask and original value have the same type
-    NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+    NewMask = DAG.getBitcast(WideVecVT, Mask);
     for (unsigned i = 0; i != NumElems; ++i)
       ShuffleVec[i] = i * SizeRatio;
     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
@@ -23323,7 +23540,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
 
     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 
-    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i != NumElems; ++i)
       ShuffleVec[i] = i * SizeRatio;
@@ -23354,7 +23571,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
-    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
     SmallVector<SDValue, 8> Chains;
     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl,
                                         TLI.getPointerTy());
@@ -23495,7 +23712,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     SDValue ExtOp0 = OldExtract.getOperand(0);
     unsigned VecSize = ExtOp0.getValueSizeInBits();
     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
-    SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtOp0);
+    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                      BitCast, OldExtract.getOperand(1));
     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
@@ -24239,10 +24456,10 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
     // DAG.
     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
     // The AND node needs bitcasts to/from an integer vector type around it.
-    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                  N->getOperand(0)->getOperand(0), MaskConst);
-    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    SDValue Res = DAG.getBitcast(VT, NewAnd);
     return Res;
   }
 
@@ -24442,8 +24659,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
     // In this case, the inner vzext is completely dead because we're going to
     // only look at bits inside of the low element. Just do the outer vzext on
     // a bitcast of the input to the inner.
-    return DAG.getNode(X86ISD::VZEXT, DL, VT,
-                       DAG.getNode(ISD::BITCAST, DL, OpVT, V));
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
   }
 
   // Check if we can bypass extracting and re-inserting an element of an input
@@ -24465,7 +24681,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                               DAG.getIntPtrConstant(0, DL));
         }
-        Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
+        Op = DAG.getBitcast(OpVT, OrigV);
         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
       }
   }
@@ -25301,6 +25517,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         Res.first = DestReg;
         Res.second = &X86::GR64RegClass;
       }
+    } else if (VT != MVT::Other) {
+      // Type mismatch and not a clobber: return an error.
+      Res.first = 0;
+      Res.second = nullptr;
     }
   } else if (Res.second == &X86::FR32RegClass ||
              Res.second == &X86::FR64RegClass ||
@@ -25326,13 +25546,23 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       Res.second = &X86::VR256RegClass;
     else if (X86::VR512RegClass.hasType(VT))
       Res.second = &X86::VR512RegClass;
+    else if (VT != MVT::Other) {
+      // Type mismatch and not a clobber: return an error.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  } else if (VT != MVT::Other) {
+    // Type mismatch and not a clobber: return an error.
+    Res.first = 0;
+    Res.second = nullptr;
   }
 
   return Res;
 }
 
 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
-                                            Type *Ty) const {
+                                            Type *Ty,
+                                            unsigned AS) const {
   // Scaling factors are not free at all.
   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
   // will take 2 allocations in the out of order engine instead of 1
@@ -25351,7 +25581,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
   // E.g., on Haswell:
   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
-  if (isLegalAddressingMode(AM, Ty))
+  if (isLegalAddressingMode(AM, Ty, AS))
     // Scale represents reg2 * scale, thus account for 1
     // as soon as we use a second register.
     return AM.Scale != 0;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b589ca42e56c..b5d062f72b24 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -56,10 +56,6 @@ namespace llvm {
       /// corresponds to X86::ANDNPS or X86::ANDNPD.
       FANDN,
 
-      /// Bitwise logical right shift of floating point values. This
-      /// corresponds to X86::PSRLDQ.
-      FSRL,
-
       /// These operations represent an abstract X86 call
       /// instruction, which includes a bunch of information.  In particular the
       /// operands of these node are:
@@ -184,6 +180,9 @@ namespace llvm {
       /// Shuffle 16 8-bit values within a vector.
       PSHUFB,
 
+      /// Compute Sum of Absolute Differences.
+      PSADBW,
+
       /// Bitwise Logical AND NOT of Packed FP values.
       ANDNP,
 
@@ -200,6 +199,7 @@ namespace llvm {
 
       /// Combined add and sub on an FP vector.
       ADDSUB,
+
       //  FP vector ops with rounding mode.
       FADD_RND,
       FSUB_RND,
@@ -207,7 +207,11 @@ namespace llvm {
       FDIV_RND,
       FMAX_RND,
       FMIN_RND,
-      
+      FSQRT_RND,
+
+      // FP vector get exponent.
+      FGETEXP_RND,
+
       // Integer add/sub with unsigned saturation.
       ADDUS,
       SUBUS,
@@ -355,6 +359,8 @@ namespace llvm {
       PSHUFHW,
       PSHUFLW,
       SHUFP,
+      // Shuffle packed values at 128-bit granularity
+      SHUF128,
       MOVDDUP,
       MOVSHDUP,
       MOVSLDUP,
@@ -374,6 +380,10 @@ namespace llvm {
       VPERMIV3,
       VPERMI,
       VPERM2X128,
+      // Fix up special packed Float32/64 values
+      VFIXUPIMM,
+      // Range restriction calculation for packed pairs of Float32/64 values
+      VRANGE,
       // Broadcast scalar to vector
       VBROADCAST,
       // Broadcast subvector to vector
@@ -729,7 +739,8 @@ namespace llvm {
 
     /// Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                               unsigned AS) const override;
 
     /// Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
@@ -748,7 +759,8 @@ namespace llvm {
     /// of the specified type.
     /// If the AM is supported, the return value must be >= 0.
     /// If the AM is not supported, it returns a negative value.
-    int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
+    int getScalingFactorCost(const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
 
     bool isVectorShiftByScalarCheap(Type *Ty) const override;
 
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 9d11d3c7050f..c1d0aef07118 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1047,12 +1047,6 @@ multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _,
              EVEX_4V;
   }
 }
-
-defm VPERMQZ :    avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>,
-                  EVEX_V512, VEX_W;
-defm VPERMPDZ :   avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>,
-                  EVEX_V512, VEX_W;
-
 defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>,
                   EVEX_V512;
 defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>,
@@ -1063,37 +1057,6 @@ def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
 def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
           (VPERMILPDZri VR512:$src1, imm:$imm)>;
 
-// -- VPERM - register form --
-multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                     PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
-
-  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-                   (ins RC:$src1, RC:$src2),
-                   !strconcat(OpcodeStr,
-                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                   [(set RC:$dst,
-                     (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
-
-  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-                   (ins RC:$src1, x86memop:$src2),
-                   !strconcat(OpcodeStr,
-                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                   [(set RC:$dst,
-                     (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
-                     EVEX_4V;
-}
-
-defm VPERMDZ   : avx512_perm<0x36, "vpermd",  VR512,  loadv16i32, i512mem,
-                           v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPERMQZ   : avx512_perm<0x36, "vpermq",  VR512,  loadv8i64,  i512mem,
-                           v8i64>,  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-let ExeDomain = SSEPackedSingle in
-defm VPERMPSZ  : avx512_perm<0x16, "vpermps", VR512,  loadv16f32, f512mem,
-                           v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-let ExeDomain = SSEPackedDouble in
-defm VPERMPDZ  : avx512_perm<0x16, "vpermpd", VR512,  loadv8f64, f512mem,
-                           v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
 // -- VPERM2I - 3 source operands form --
 multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           PatFrag mem_frag, X86MemOperand x86memop,
@@ -3401,32 +3364,6 @@ defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
                                 VR512, loadv8i64, i512mem>, EVEX_V512,
                                 VEX_W, EVEX_CD8<64, CD8VF>;
 //===----------------------------------------------------------------------===//
-// AVX-512 - PSHUFD
-//
-
-multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                         SDNode OpNode, PatFrag mem_frag,
-                         X86MemOperand x86memop, ValueType OpVT> {
-  def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
-                     (ins RC:$src1, u8imm:$src2),
-                     !strconcat(OpcodeStr,
-                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set RC:$dst,
-                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
-                     EVEX;
-  def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
-                     (ins x86memop:$src1, u8imm:$src2),
-                     !strconcat(OpcodeStr,
-                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set RC:$dst,
-                       (OpVT (OpNode (mem_frag addr:$src1),
-                              (i8 imm:$src2))))]>, EVEX;
-}
-
-defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, loadv16i32,
-                      i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-//===----------------------------------------------------------------------===//
 // AVX-512  Logical Instructions
 //===----------------------------------------------------------------------===//
 
@@ -3729,14 +3666,14 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
                    (ins _.RC:$src1, u8imm:$src2), OpcodeStr,
                       "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
-                   SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
+                   SSE_INTSHIFT_ITINS_P.rr>;
   let mayLoad = 1 in
   defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
                    (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
                           (i8 imm:$src2))),
-                   SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
+                   SSE_INTSHIFT_ITINS_P.rm>;
 }
 
 multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
@@ -3746,7 +3683,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
                    (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
       "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
      (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
-     SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B;
+     SSE_INTSHIFT_ITINS_P.rm>, EVEX_B;
 }
 
 multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3836,16 +3773,16 @@ multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
 }
 
 defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
-             avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>;
+             avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V;
 
 defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
-             avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>;
+             avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V;
 
 defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>,
-             avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>;
+             avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V;
 
-defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>;
-defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>;
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V;
 
 defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
 defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
@@ -3865,7 +3802,8 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))),
+                   (_.VT (OpNode _.RC:$src1,
+                   (_.VT (bitconvert (_.LdFrag addr:$src2))))),
                    SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
                    EVEX_CD8<_.EltSize, CD8VF>;
 }
@@ -3927,6 +3865,65 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
 defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
 defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
 
+//===-------------------------------------------------------------------===//
+// 1-src variable permutation VPERMW/D/Q
+//===-------------------------------------------------------------------===//
+multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  AVX512VLVectorVTInfo _> {
+  let Predicates  = [HasAVX512] in
+  defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+           avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in
+  defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+              avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+}
+
+multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                                 string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasAVX512] in
+  defm Z:    avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info512>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info512>, EVEX_V512;
+  let Predicates = [HasAVX512, HasVLX] in
+  defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info256>,
+             avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+                              VTInfo.info256>, EVEX_V256;
+}
+
+
+defm VPERM  : avx512_var_shift_w<0x8D, "vpermw", X86VPermv>;
+
+defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
+                                    avx512vl_i32_info>;
+defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
+                                    avx512vl_i64_info>, VEX_W;
+defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
+                                    avx512vl_f32_info>;
+defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
+                                    avx512vl_f64_info>, VEX_W;
+
+defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
+                             X86VPermi, avx512vl_i64_info>,
+                             EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
+                             X86VPermi, avx512vl_f64_info>,
+                             EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
+//===----------------------------------------------------------------------===//
+
+defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
+                             X86PShufd, avx512vl_i32_info>, 
+                             EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
+defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
+                                  X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W;
+defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
+                                  X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W;
 //===----------------------------------------------------------------------===//
 // AVX-512 - MOVDDUP
 //===----------------------------------------------------------------------===//
@@ -4869,11 +4866,6 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          (ins _.RC:$src), OpcodeStr, "$src", "$src",
                          (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
 
-  defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
-                        (ins _.RC:$src), OpcodeStr,
-                        "{sae}, $src", "$src, {sae}",
-                        (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
-
   defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
                          (OpNode (_.FloatVT
@@ -4881,24 +4873,58 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                           (i32 FROUND_CURRENT))>;
 
   defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                         (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                         (ins _.MemOp:$src), OpcodeStr,
+                         "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
                          (OpNode (_.FloatVT
                                   (X86VBroadcast (_.ScalarLdFrag addr:$src))),
                                  (i32 FROUND_CURRENT))>, EVEX_B;
 }
+multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         SDNode OpNode> {
+  defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _.RC:$src), OpcodeStr,
+                        "{sae}, $src", "$src, {sae}",
+                        (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
+}
 
 multiclass  avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
    defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
-                     EVEX_CD8<32, CD8VF>;
+             avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+             T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
    defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
-                     VEX_W, EVEX_CD8<32, CD8VF>;
+             avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+             T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 }
 
+multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode> {
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>,
+                                     EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>,
+                                     EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>,
+                                     EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>,
+                                     EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
+  }
+}
 let Predicates = [HasERI], hasSideEffects = 0 in {
 
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD;
- defm VRCP28   : avx512_eri<0xCA, "vrcp28",   X86rcp28>,   EVEX, EVEX_V512, T8PD;
- defm VEXP2    : avx512_eri<0xC8, "vexp2",    X86exp2>,    EVEX, EVEX_V512, T8PD;
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
+ defm VRCP28   : avx512_eri<0xCA, "vrcp28",   X86rcp28>,   EVEX;
+ defm VEXP2    : avx512_eri<0xC8, "vexp2",    X86exp2>,    EVEX;
+}
+defm VGETEXP   : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
+                 avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX;
+
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
+                              SDNode OpNodeRnd, X86VectorVTInfo _>{
+  defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
+                         (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
+                         EVEX, EVEX_B, EVEX_RC;
 }
 
 multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
@@ -5007,20 +5033,22 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
   }
 }
 
-defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>;
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
+                                          SDNode OpNodeRnd> {
+  defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd,
+                                v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd,
+                                v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+}
+
+defm VSQRT   : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
+               avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
 
 defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt",
                 int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd,
                 SSE_SQRTSS, SSE_SQRTSD>;
 
 let Predicates = [HasAVX512] in {
-  def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1),
-                    (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)),
-                   (VSQRTPSZr VR512:$src1)>;
-  def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1),
-                    (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)),
-                   (VSQRTPDZr VR512:$src1)>;
-
   def : Pat<(f32 (fsqrt FR32X:$src)),
             (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
   def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -5583,30 +5611,6 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1,
                             (loadv8i64 addr:$src2), (i8 imm:$imm))),
           (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
 
-multiclass avx512_valign<X86VectorVTInfo _> {
-  defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
-                     (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
-                     "valign"##_.Suffix,
-                     "$src3, $src2, $src1", "$src1, $src2, $src3",
-                     (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
-                                      (i8 imm:$src3)))>,
-             AVX512AIi8Base, EVEX_4V;
-
-  // Also match valign of packed floats.
-  def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
-            (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
-
-  let mayLoad = 1 in
-  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
-                     (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
-                     !strconcat("valign"##_.Suffix,
-                     "\t{$src3, $src2, $src1, $dst|"
-                         "$dst, $src1, $src2, $src3}"),
-                     []>, EVEX_4V;
-}
-defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
 // Helper fragments to match sext vXi1 to vXiY.
 def v16i1sextv16i32  : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
 def v8i1sextv8i64  : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
@@ -5949,7 +5953,7 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
                                               (_.LdFrag addr:$src))),
                                       _.RC:$src0)))]>,
               EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
-  
+
   let mayLoad = 1 in
   def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
               (ins _.KRCWM:$mask, _.MemOp:$src),
@@ -5958,7 +5962,6 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
                                       (_.VT (bitconvert (_.LdFrag addr:$src))),
                                      _.ImmAllZerosV)))]>,
               EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>;
-  
 }
 
 multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
@@ -5979,3 +5982,212 @@ defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
                                          EVEX;
 defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
                                          EVEX, VEX_W;
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                               op(reg_vec2,mem_vec,imm)
+//                               op(reg_vec2,broadcast(eltVt),imm)
+//all instructions are created with FROUND_CURRENT
+multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                            X86VectorVTInfo _>{
+  defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i8 imm:$src3),
+                              (i32 FROUND_CURRENT))>;
+  let mayLoad = 1 in {
+    defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                              (i8 imm:$src3),
+                              (i32 FROUND_CURRENT))>;
+    defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+                      "$src1, ${src2}"##_.BroadcastStr##", $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+                              (i8 imm:$src3),
+                              (i32 FROUND_CURRENT))>, EVEX_B;
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                               op(reg_vec2,mem_vec,imm)
+//                               op(reg_vec2,broadcast(eltVt),imm)
+multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                            X86VectorVTInfo _>{
+  defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i8 imm:$src3))>;
+  let mayLoad = 1 in {
+    defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                              (i8 imm:$src3))>;
+    defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
+                      "$src1, ${src2}"##_.BroadcastStr##", $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+                              (i8 imm:$src3))>, EVEX_B;
+  }
+}
+
+//handle scalar instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm)
+//                                      op(reg_vec2,mem_scalar,imm)
+//all instructions are created with FROUND_CURRENT
+multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                                           X86VectorVTInfo _> {
+
+  defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i8 imm:$src3),
+                              (i32 FROUND_CURRENT))>;
+  let mayLoad = 1 in {
+    defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT (scalar_to_vector
+                                        (_.ScalarLdFrag addr:$src2))),
+                              (i8 imm:$src3),
+                              (i32 FROUND_CURRENT))>;
+
+    let isAsmParserOnly = 1 in {
+      defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst),
+                      (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
+                      []>;
+    }
+  }
+}
+
+//handle instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
+multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+                                             SDNode OpNode, X86VectorVTInfo _>{
+  defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+                      OpcodeStr, "$src3,{sae}, $src2, $src1",
+                      "$src1, $src2,{sae}, $src3",
+                      (OpNode (_.VT _.RC:$src1),
+                              (_.VT _.RC:$src2),
+                              (i8 imm:$src3),
+                              (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+//handle scalar instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
+multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
+                                             SDNode OpNode, X86VectorVTInfo _> {
+  defm NAME: avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _>;
+}
+
+multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
+            AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+  let Predicates = [prd] in {
+    defm Z    : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+                                  EVEX_V512;
+
+  }
+  let Predicates = [prd, HasVLX] in {
+    defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+                                  EVEX_V128;
+    defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+                                  EVEX_V256;
+  }
+}
+
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+                                bits<8> opc, SDNode OpNode>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+    defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  }
+}
+
+multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
+                  X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+  let Predicates = [prd] in {
+     defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>,
+                 avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>;
+  }
+}
+
+defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd",
+                              avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VFIXUPIMMPS : avx512_common_fp_sae_packed_imm<"vfixupimmps",
+                              avx512vl_f32_info, 0x54, X86VFixupimm, HasAVX512>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VFIXUPIMMSD: avx512_common_fp_sae_scalar_imm<"vfixupimmsd", f64x_info,
+                                                 0x55, X86VFixupimm, HasAVX512>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info,
+                                                 0x55, X86VFixupimm, HasAVX512>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
+                                                       0x50, X86VRange, HasDQI>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
+                                                       0x50, X86VRange, HasDQI>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+
+defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info,
+                                                 0x51, X86VRange, HasDQI>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
+defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
+                                                 0x51, X86VRange, HasDQI>,
+      AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+
+multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
+                                       bits<8> opc, SDNode OpNode = X86Shuf128>{
+  let Predicates = [HasAVX512] in {
+    defm Z    : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+
+  }
+  let Predicates = [HasAVX512, HasVLX] in {
+     defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+  }
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
+      AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+                                                AVX512VLVectorVTInfo VTInfo_FP>{
+  defm NAME:       avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+  let isCodeGenOnly = 1 in {
+    defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
+                           AVX512AIi8Base, EVEX_4V;
+  }
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+                                                  EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+                                                  EVEX_CD8<64, CD8VF>, VEX_W;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 331faf2fd0b4..e2fa295c0230 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -764,6 +764,14 @@ class AVX512BIi8Base : PD {
   Domain ExeDomain = SSEPackedInt;
   ImmType ImmT = Imm8;
 }
+class AVX512XSIi8Base : XS {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
+class AVX512XDIi8Base : XD {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
 class AVX512PSIi8Base : PS {
   Domain ExeDomain = SSEPackedSingle;
   ImmType ImmT = Imm8;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 79d213c6e1a3..dfe58ef8067b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -35,8 +35,6 @@ def bc_mmx  : PatFrag<(ops node:$in), (x86mmx  (bitconvert node:$in))>;
 // SSE specific DAG Nodes.
 //===----------------------------------------------------------------------===//
 
-def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
-                                            SDTCisFP<0>, SDTCisInt<2> ]>;
 def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
                                        SDTCisFP<1>, SDTCisVT<3, i8>,
                                        SDTCisVec<1>]>;
@@ -65,7 +63,6 @@ def X86fandn   : SDNode<"X86ISD::FANDN",     SDTFPBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
 def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
 def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
-def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
 def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
 def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
 def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
@@ -78,6 +75,9 @@ def X86cmps    : SDNode<"X86ISD::FSETCC",     SDTX86Cmps>;
 def X86pshufb  : SDNode<"X86ISD::PSHUFB",
                  SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
+def X86psadbw  : SDNode<"X86ISD::PSADBW",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
 def X86andnp   : SDNode<"X86ISD::ANDNP",
                  SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
@@ -219,6 +219,8 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                  SDTCisSameAs<0,1>, SDTCisInt<2>]>;
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                  SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                             SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>;
 
 def SDTVBroadcast  : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
 def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
@@ -229,6 +231,9 @@ def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
 def SDTFPBinOpRound : SDTypeProfile<1, 3, [      // fadd_round, fmul_round, etc.
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>;
 
+def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [      // fsqrt_round, fgetexp_round, etc.
+  SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]>;
+
 def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
                            SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
 def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
@@ -247,7 +252,8 @@ def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
 def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
 def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
 
-def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+def X86Shufp   : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>;
 
 def X86Movddup  : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
 def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
@@ -279,6 +285,9 @@ def X86VPermiv3   : SDNode<"X86ISD::VPERMIV3",  SDTShuff3Op>;
 
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 
+def X86VFixupimm       : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>;
+def X86VRange          : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
+
 def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
                     SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                          SDTCisSubVecOfVec<1, 0>]>, []>;
@@ -298,6 +307,8 @@ def X86fmulRnd   : SDNode<"X86ISD::FMUL_RND",  SDTFPBinOpRound>;
 def X86fdivRnd   : SDNode<"X86ISD::FDIV_RND",  SDTFPBinOpRound>;
 def X86fmaxRnd   : SDNode<"X86ISD::FMAX_RND",      SDTFPBinOpRound>;
 def X86fminRnd   : SDNode<"X86ISD::FMIN_RND",      SDTFPBinOpRound>;
+def X86fsqrtRnd     : SDNode<"X86ISD::FSQRT_RND",  SDTFPUnaryOpRound>;
+def X86fgetexpRnd   : SDNode<"X86ISD::FGETEXP_RND",  SDTFPUnaryOpRound>;
 
 def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
 def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 43decf7cdda9..6b7a9299dcfb 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -433,6 +433,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
   }
 
   static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+    { X86::BSF16rr,         X86::BSF16rm,             0 },
+    { X86::BSF32rr,         X86::BSF32rm,             0 },
+    { X86::BSF64rr,         X86::BSF64rm,             0 },
+    { X86::BSR16rr,         X86::BSR16rm,             0 },
+    { X86::BSR32rr,         X86::BSR32rm,             0 },
+    { X86::BSR64rr,         X86::BSR64rm,             0 },
     { X86::CMP16rr,         X86::CMP16rm,             0 },
     { X86::CMP32rr,         X86::CMP32rm,             0 },
     { X86::CMP64rr,         X86::CMP64rm,             0 },
@@ -1690,8 +1696,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
     { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
     { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
-    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
-    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+    { X86::VALIGNQZrri,       X86::VALIGNQZrmi,         0 },
+    { X86::VALIGNDZrri,       X86::VALIGNDZrmi,         0 },
     { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
     { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
@@ -4697,8 +4703,17 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   return false;
 }
 
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4) // FrameIndex only
+    addOffset(MIB, 0);
+}
+
 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
                                      ArrayRef<MachineOperand> MOs,
+                                     MachineBasicBlock::iterator InsertPt,
                                      MachineInstr *MI,
                                      const TargetInstrInfo &TII) {
   // Create the base instruction with the memory operand as the first part.
@@ -4706,11 +4721,7 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
   MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
                                               MI->getDebugLoc(), true);
   MachineInstrBuilder MIB(MF, NewMI);
-  unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4)  // FrameIndex only
-    addOffset(MIB, 0);
+  addOperands(MIB, MOs);
 
   // Loop over the rest of the ri operands, converting them over.
   unsigned NumOps = MI->getDesc().getNumOperands()-2;
@@ -4722,11 +4733,16 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
     MachineOperand &MO = MI->getOperand(i);
     MIB.addOperand(MO);
   }
+
+  MachineBasicBlock *MBB = InsertPt->getParent();
+  MBB->insert(InsertPt, NewMI);
+
   return MIB;
 }
 
 static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
                               unsigned OpNo, ArrayRef<MachineOperand> MOs,
+                              MachineBasicBlock::iterator InsertPt,
                               MachineInstr *MI, const TargetInstrInfo &TII) {
   // Omit the implicit operands, something BuildMI can't do.
   MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
@@ -4737,38 +4753,32 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
     MachineOperand &MO = MI->getOperand(i);
     if (i == OpNo) {
       assert(MO.isReg() && "Expected to fold into reg operand!");
-      unsigned NumAddrOps = MOs.size();
-      for (unsigned i = 0; i != NumAddrOps; ++i)
-        MIB.addOperand(MOs[i]);
-      if (NumAddrOps < 4)  // FrameIndex only
-        addOffset(MIB, 0);
+      addOperands(MIB, MOs);
     } else {
       MIB.addOperand(MO);
     }
   }
+
+  MachineBasicBlock *MBB = InsertPt->getParent();
+  MBB->insert(InsertPt, NewMI);
+
   return MIB;
 }
 
 static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
                                 ArrayRef<MachineOperand> MOs,
+                                MachineBasicBlock::iterator InsertPt,
                                 MachineInstr *MI) {
-  MachineFunction &MF = *MI->getParent()->getParent();
-  MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
-
-  unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4)  // FrameIndex only
-    addOffset(MIB, 0);
+  MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+                                    MI->getDebugLoc(), TII.get(Opcode));
+  addOperands(MIB, MOs);
   return MIB.addImm(0);
 }
 
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                  MachineInstr *MI,
-                                                  unsigned OpNum,
-                                                  ArrayRef<MachineOperand> MOs,
-                                                  unsigned Size, unsigned Align,
-                                                  bool AllowCommute) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align, bool AllowCommute) const {
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4802,7 +4812,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     isTwoAddrFold = true;
   } else if (OpNum == 0) {
     if (MI->getOpcode() == X86::MOV32r0) {
-      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
       if (NewMI)
         return NewMI;
     }
@@ -4847,9 +4857,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       }
 
       if (isTwoAddrFold)
-        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
       else
-        NewMI = FuseInst(MF, Opcode, OpNum, MOs, MI, *this);
+        NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
 
       if (NarrowToMOV32rm) {
         // If this is the special case where we use a MOV32rm to load a 32-bit
@@ -4901,8 +4911,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         // Attempt to fold with the commuted version of the instruction.
         unsigned CommuteOp =
             (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
-        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
-                                      /*AllowCommute=*/false);
+        NewMI =
+            foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align,
+                                  /*AllowCommute=*/false);
         if (NewMI)
           return NewMI;
 
@@ -5131,10 +5142,9 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
   MI->addRegisterKilled(Reg, TRI, true);
 }
 
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                  MachineInstr *MI,
-                                                  ArrayRef<unsigned> Ops,
-                                                  int FrameIndex) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
   // Check switch flag
   if (NoFusing) return nullptr;
 
@@ -5173,8 +5183,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     return nullptr;
 
   return foldMemoryOperandImpl(MF, MI, Ops[0],
-                               MachineOperand::CreateFI(FrameIndex), Size,
-                               Alignment, /*AllowCommute=*/true);
+                               MachineOperand::CreateFI(FrameIndex), InsertPt,
+                               Size, Alignment, /*AllowCommute=*/true);
 }
 
 static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -5196,17 +5206,16 @@ static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
   return false;
 }
 
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                  MachineInstr *MI,
-                                                  ArrayRef<unsigned> Ops,
-                                                  MachineInstr *LoadMI) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
   // If loading from a FrameIndex, fold directly from the FrameIndex.
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
     if (isPartialRegisterLoad(*LoadMI, MF))
       return nullptr;
-    return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
   }
 
   // Check switch flag
@@ -5326,7 +5335,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     break;
   }
   }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
                                /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }
 
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 0dd8101bbe5b..ac1b2d4fedc6 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -307,6 +307,7 @@ public:
   /// references has been changed.
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       int FrameIndex) const override;
 
   /// foldMemoryOperand - Same as the previous version except it allows folding
@@ -314,6 +315,7 @@ public:
   /// stack slot.
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
                                       MachineInstr *LoadMI) const override;
 
   /// canFoldMemoryOperand - Returns true if the specified load / store is
@@ -407,6 +409,7 @@ public:
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                       unsigned OpNum,
                                       ArrayRef<MachineOperand> MOs,
+                                      MachineBasicBlock::iterator InsertPt,
                                       unsigned Size, unsigned Alignment,
                                       bool AllowCommute) const;
 
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 70c2027520f9..e936b4bc466e 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -788,6 +788,7 @@ def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
 def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
+def HasMPX       : Predicate<"Subtarget->hasMPX()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
 def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
                              AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -2456,6 +2457,9 @@ include "X86InstrAVX512.td"
 include "X86InstrMMX.td"
 include "X86Instr3DNow.td"
 
+// MPX instructions
+include "X86InstrMPX.td"
+
 include "X86InstrVMX.td"
 include "X86InstrSVM.td"
 
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
new file mode 100644
index 000000000000..cf5e2e38fe58
--- /dev/null
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -0,0 +1,70 @@
+//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
+              OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+              OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs), (ins  BNDR:$src1, i32mem:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs), (ins  BNDR:$src1, i64mem:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+  def 32rr: I<opc, MRMSrcReg, (outs), (ins  BNDR:$src1, GR32:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rr: RI<opc, MRMSrcReg, (outs), (ins  BNDR:$src1, GR64:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+
+def BNDMOVRMrr   : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+                    "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX]>;
+def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                    "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+                    "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, In64BitMode]>;
+
+def BNDMOVMRrr   : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+                    "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX]>;
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src),
+                    "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src),
+                    "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                    Requires<[HasMPX, In64BitMode]>;
+
+def BNDSTXmr:      I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+                    "bndstx \t{$src, $dst|$dst, $src}", []>, TB,
+                    Requires<[HasMPX]>;
+def BNDLDXrm:      I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                    "bndldx \t{$src, $dst|$dst, $src}", []>, TB,
+                    Requires<[HasMPX]>;
+\ No newline at end of file
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d3b401e8cfcb..8294e38e9957 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3560,7 +3560,7 @@ multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
   let Predicates = [HasAVX] in {
     def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
               (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
-    
+
     def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
               (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
   }
@@ -4053,6 +4053,20 @@ defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
 defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                  int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 
+let Predicates = [HasAVX2] in
+  def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1),
+                              (v32i8 VR256:$src2))),
+            (VPSADBWYrr VR256:$src2, VR256:$src1)>;
+
+let Predicates = [HasAVX] in
+  def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
+                              (v16i8 VR128:$src2))),
+            (VPSADBWrr VR128:$src2, VR128:$src1)>;
+
+def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+          (PSADBWrr VR128:$src2, VR128:$src1)>;
+
 let Predicates = [HasAVX] in
 defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                               loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
@@ -4207,16 +4221,6 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in {
 }
 } // Constraints = "$src1 = $dst"
 
-let Predicates = [HasAVX] in {
-  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
-            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-}
-
-let Predicates = [UseSSE2] in {
-  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
-            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
-}
-
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Comparison Instructions
 //===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 4af514a83ca5..0268066c2ba1 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -21,7 +21,8 @@ enum IntrinsicType {
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
   CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
-  INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK,
+  INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK,
+  INTR_TYPE_3OP_MASK, FMA_OP_MASK,
   INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
   EXPAND_FROM_MEM, BLEND
 };
@@ -339,9 +340,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
                      X86ISD::FDIV_RND),
   X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
-  X86ISD::FDIV_RND),
+                     X86ISD::FDIV_RND),
   X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV,
-  X86ISD::FDIV_RND),
+                     X86ISD::FDIV_RND),
   X86_INTRINSIC_DATA(avx512_mask_expand_d_128,  COMPRESS_EXPAND_IN_REG,
                      X86ISD::EXPAND, 0),
   X86_INTRINSIC_DATA(avx512_mask_expand_d_256,  COMPRESS_EXPAND_IN_REG,
@@ -366,6 +367,18 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
                      X86ISD::EXPAND, 0),
   X86_INTRINSIC_DATA(avx512_mask_expand_q_512,  COMPRESS_EXPAND_IN_REG,
                      X86ISD::EXPAND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, 
+                     X86ISD::FGETEXP_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, 
+                     X86ISD::FGETEXP_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
@@ -559,6 +572,14 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
                      X86ISD::RNDSCALE, 0),
   X86_INTRINSIC_DATA(avx512_mask_rndscale_ss,   INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::RNDSCALE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
   X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
@@ -583,6 +604,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 556b518936f3..ff1436af4ece 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -128,6 +128,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
   const DataLayout *DL = TM.getDataLayout();
   assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
 
+  MCSymbol *Sym = nullptr;
   SmallString<128> Name;
   StringRef Suffix;
 
@@ -160,12 +161,14 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
     else
       getMang()->getNameWithPrefix(Name, MO.getSymbolName());
   } else if (MO.isMBB()) {
-    Name += MO.getMBB()->getSymbol()->getName();
+    assert(Suffix.empty());
+    Sym = MO.getMBB()->getSymbol();
   }
   unsigned OrigLen = Name.size() - PrefixLen;
 
   Name += Suffix;
-  MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
+  if (!Sym)
+    Sym = Ctx.getOrCreateSymbol(Name);
 
   StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
 
@@ -240,10 +243,10 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
 
   case X86II::MO_TLVP:      RefKind = MCSymbolRefExpr::VK_TLVP; break;
   case X86II::MO_TLVP_PIC_BASE:
-    Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+    Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
     // Subtract the pic base.
-    Expr = MCBinaryExpr::CreateSub(Expr,
-                                  MCSymbolRefExpr::Create(MF.getPICBaseSymbol(),
+    Expr = MCBinaryExpr::createSub(Expr,
+                                  MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
                                                            Ctx),
                                    Ctx);
     break;
@@ -264,10 +267,10 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case X86II::MO_PIC_BASE_OFFSET:
   case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
   case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
-    Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+    Expr = MCSymbolRefExpr::create(Sym, Ctx);
     // Subtract the pic base.
-    Expr = MCBinaryExpr::CreateSub(Expr,
-                            MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx),
+    Expr = MCBinaryExpr::createSub(Expr,
+                            MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
                                    Ctx);
     if (MO.isJTI()) {
       assert(MAI.doesSetDirectiveSuppressesReloc());
@@ -277,17 +280,17 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
       // section so we are restricting it to jumptable references.
       MCSymbol *Label = Ctx.createTempSymbol();
       AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
-      Expr = MCSymbolRefExpr::Create(Label, Ctx);
+      Expr = MCSymbolRefExpr::create(Label, Ctx);
     }
     break;
   }
 
   if (!Expr)
-    Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+    Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
 
   if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+    Expr = MCBinaryExpr::createAdd(Expr,
+                                   MCConstantExpr::create(MO.getOffset(), Ctx),
                                    Ctx);
   return MCOperand::createExpr(Expr);
 }
@@ -710,7 +713,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   }
 
   MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
-  const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context);
+  const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
 
   MCInst LEA;
   if (is64Bits) {
@@ -749,7 +752,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
   MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
   const MCSymbolRefExpr *tlsRef =
-    MCSymbolRefExpr::Create(tlsGetAddr,
+    MCSymbolRefExpr::create(tlsGetAddr,
                             MCSymbolRefExpr::VK_PLT,
                             context);
 
@@ -1071,7 +1074,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     // FIXME: We would like an efficient form for this, so we don't have to do a
     // lot of extra uniquing.
     EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
-      .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+      .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
 
     // Emit the label.
     OutStreamer->EmitLabel(PICBase);
@@ -1100,12 +1103,12 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     // Now that we have emitted the label, lower the complex operand expression.
     MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
 
-    const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
     const MCExpr *PICBase =
-      MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext);
-    DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+      MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+    DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
 
-    DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+    DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
                                       DotExpr, OutContext);
 
     EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 1f361631a0b7..e9b6bfc3273c 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -175,12 +175,12 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
       return &X86::GR64_NOSPRegClass;
     return &X86::GR32_NOSPRegClass;
   case 2: // Available for tailcall (not callee-saved GPRs).
-    if (IsWin64)
+    const Function *F = MF.getFunction();
+    if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64))
       return &X86::GR64_TCW64RegClass;
     else if (Is64Bit)
       return &X86::GR64_TCRegClass;
 
-    const Function *F = MF.getFunction();
     bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
     if (hasHipeCC)
       return &X86::GR32RegClass;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 2e735fa3c026..cdb151c26a05 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -302,6 +302,11 @@ def CR15 : X86Reg<"cr15", 15>;
 def EIZ : X86Reg<"eiz", 4>;
 def RIZ : X86Reg<"riz", 4>;
 
+// Bound registers, used in MPX instructions
+def BND0 : X86Reg<"bnd0",   0>;
+def BND1 : X86Reg<"bnd1",   1>;
+def BND2 : X86Reg<"bnd2",   2>;
+def BND3 : X86Reg<"bnd3",   3>;
 
 //===----------------------------------------------------------------------===//
 // Register Class Definitions... now that we have all of the pieces, define the
@@ -484,3 +489,6 @@ def VK8WM   : RegisterClass<"X86", [v8i1],  8,  (sub VK8, K0)> {let Size = 8;}
 def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>   {let Size = 16;}
 def VK32WM  : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
 def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
+
+// Bound registers
+def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
+\ No newline at end of file
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 1cdab14e034e..74af29f4566c 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -259,6 +259,7 @@ void X86Subtarget::initializeEnvironment() {
   HasSHA = false;
   HasPRFCHW = false;
   HasRDSEED = false;
+  HasMPX = false;
   IsBTMemSlow = false;
   IsSHLDSlow = false;
   IsUAMemFast = false;
@@ -273,8 +274,6 @@ void X86Subtarget::initializeEnvironment() {
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
-  UseSqrtEst = false;
-  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 455dd7744d73..a476f7aba932 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -190,16 +190,6 @@ protected:
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
-  /// Use the RSQRT* instructions to optimize square root calculations.
-  /// For this to be profitable, the cost of FSQRT and FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseSqrtEst;
-
-  /// Use the RCP* instructions to optimize FP division calculations.
-  /// For this to be profitable, the cost of FDIV must be
-  /// substantially higher than normal FP ops like FADD and FMUL.
-  bool UseReciprocalEst;
-
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -218,6 +208,9 @@ protected:
   /// Processor has AVX-512 Vector Length eXtenstions
   bool HasVLX;
 
+  /// Processor supports MPX - Memory Protection Extensions
+  bool HasMPX;
+
   /// Use software floating point for code generation.
   bool UseSoftFloat;
 
@@ -377,14 +370,13 @@ public:
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
-  bool useSqrtEst() const { return UseSqrtEst; }
-  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
   bool hasDQI() const { return HasDQI; }
   bool hasBWI() const { return HasBWI; }
   bool hasVLX() const { return HasVLX; }
+  bool hasMPX() const { return HasMPX; }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
   bool isSLM() const { return X86ProcFamily == IntelSLM; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 3e5f1d82202f..646cff7c5bdb 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -105,6 +105,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
+  // TODO: By default, all reciprocal estimate operations are off because
+  // that matches the behavior before TargetRecip was added (except for btver2
+  // which used subtarget features to enable this type of codegen).
+  // We should change this to match GCC behavior where everything but
+  // scalar division estimates are turned on by default with -ffast-math.
+  this->Options.Reciprocals.setDefaults("all", false, 1);
+
   initAsmInfo();
 }
 
@@ -221,9 +228,9 @@ bool X86PassConfig::addILPOpts() {
 }
 
 bool X86PassConfig::addPreISel() {
-  // Only add this pass for 32-bit x86.
+  // Only add this pass for 32-bit x86 Windows.
   Triple TT(TM->getTargetTriple());
-  if (TT.getArch() == Triple::x86)
+  if (TT.isOSWindows() && TT.getArch() == Triple::x86)
     addPass(createX86WinEHStatePass());
   return true;
 }
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 6bf45c37e38b..f9f62904b64b 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -32,9 +32,9 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
   if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
     const MCSymbol *Sym = TM.getSymbol(GV, Mang);
     const MCExpr *Res =
-      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
-    const MCExpr *Four = MCConstantExpr::Create(4, getContext());
-    return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+    const MCExpr *Four = MCConstantExpr::create(4, getContext());
+    return MCBinaryExpr::createAdd(Res, Four, getContext());
   }
 
   return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
@@ -55,14 +55,14 @@ const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
   // foo@GOTPCREL+4+<offset>.
   unsigned FinalOff = Offset+MV.getConstant()+4;
   const MCExpr *Res =
-    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
-  const MCExpr *Off = MCConstantExpr::Create(FinalOff, getContext());
-  return MCBinaryExpr::CreateAdd(Res, Off, getContext());
+    MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+  const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext());
+  return MCBinaryExpr::createAdd(Res, Off, getContext());
 }
 
 const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
     const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+  return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
 }
 
 void
@@ -116,7 +116,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
   if (GOLHS->isThreadLocal())
     return nullptr;
 
-  return MCSymbolRefExpr::Create(TM.getSymbol(GOLHS, Mang),
+  return MCSymbolRefExpr::create(TM.getSymbol(GOLHS, Mang),
                                  MCSymbolRefExpr::VK_COFF_IMGREL32,
                                  getContext());
 }
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 4efaada40926..ce69ea721993 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -16,6 +16,7 @@
 
 #include "X86.h"
 #include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -59,30 +60,49 @@ public:
 private:
   void emitExceptionRegistrationRecord(Function *F);
 
-  void linkExceptionRegistration(IRBuilder<> &Builder, Value *RegNode,
-                                 Value *Handler);
-  void unlinkExceptionRegistration(IRBuilder<> &Builder, Value *RegNode);
+  void linkExceptionRegistration(IRBuilder<> &Builder, Value *Handler);
+  void unlinkExceptionRegistration(IRBuilder<> &Builder);
+  void addCXXStateStores(Function &F, MachineModuleInfo &MMI);
+  void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo,
+                                  Function &F, int BaseState);
+  void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State);
 
   Value *emitEHLSDA(IRBuilder<> &Builder, Function *F);
 
   Function *generateLSDAInEAXThunk(Function *ParentFunc);
 
+  int escapeRegNode(Function &F);
+
   // Module-level type getters.
-  Type *getEHRegistrationType();
-  Type *getSEH3RegistrationType();
-  Type *getSEH4RegistrationType();
-  Type *getCXXEH3RegistrationType();
+  Type *getEHLinkRegistrationType();
+  Type *getSEHRegistrationType();
+  Type *getCXXEHRegistrationType();
 
   // Per-module data.
   Module *TheModule = nullptr;
-  StructType *EHRegistrationTy = nullptr;
-  StructType *CXXEH3RegistrationTy = nullptr;
-  StructType *SEH3RegistrationTy = nullptr;
-  StructType *SEH4RegistrationTy = nullptr;
+  StructType *EHLinkRegistrationTy = nullptr;
+  StructType *CXXEHRegistrationTy = nullptr;
+  StructType *SEHRegistrationTy = nullptr;
+  Function *FrameRecover = nullptr;
+  Function *FrameAddress = nullptr;
+  Function *FrameEscape = nullptr;
 
   // Per-function state
   EHPersonality Personality = EHPersonality::Unknown;
   Function *PersonalityFn = nullptr;
+
+  /// The stack allocation containing all EH data, including the link in the
+  /// fs:00 chain and the current state.
+  AllocaInst *RegNode = nullptr;
+
+  /// Struct type of RegNode. Used for GEPing.
+  Type *RegNodeTy = nullptr;
+
+  /// The index of the state field of RegNode.
+  int StateFieldIndex = ~0U;
+
+  /// The linked list node subobject inside of RegNode.
+  Value *Link = nullptr;
 };
 }
 
@@ -92,16 +112,21 @@ char WinEHStatePass::ID = 0;
 
 bool WinEHStatePass::doInitialization(Module &M) {
   TheModule = &M;
+  FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::frameescape);
+  FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::framerecover);
+  FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress);
   return false;
 }
 
 bool WinEHStatePass::doFinalization(Module &M) {
   assert(TheModule == &M);
   TheModule = nullptr;
-  EHRegistrationTy = nullptr;
-  CXXEH3RegistrationTy = nullptr;
-  SEH3RegistrationTy = nullptr;
-  SEH4RegistrationTy = nullptr;
+  EHLinkRegistrationTy = nullptr;
+  CXXEHRegistrationTy = nullptr;
+  SEHRegistrationTy = nullptr;
+  FrameEscape = nullptr;
+  FrameRecover = nullptr;
+  FrameAddress = nullptr;
   return false;
 }
 
@@ -136,8 +161,19 @@ bool WinEHStatePass::runOnFunction(Function &F) {
   if (!isMSVCEHPersonality(Personality))
     return false;
 
+  // Disable frame pointer elimination in this function.
+  // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we
+  // use an arbitrary register?
+  F.addFnAttr("no-frame-pointer-elim", "true");
+
   emitExceptionRegistrationRecord(&F);
-  // FIXME: State insertion.
+
+  auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>();
+  assert(MMIPtr && "MachineModuleInfo should always be available");
+  MachineModuleInfo &MMI = *MMIPtr;
+  if (Personality == EHPersonality::MSVC_CXX) {
+    addCXXStateStores(F, MMI);
+  }
 
   // Reset per-function state.
   PersonalityFn = nullptr;
@@ -152,17 +188,17 @@ bool WinEHStatePass::runOnFunction(Function &F) {
 ///     EHRegistrationNode *Next;
 ///     PEXCEPTION_ROUTINE Handler;
 ///   };
-Type *WinEHStatePass::getEHRegistrationType() {
-  if (EHRegistrationTy)
-    return EHRegistrationTy;
+Type *WinEHStatePass::getEHLinkRegistrationType() {
+  if (EHLinkRegistrationTy)
+    return EHLinkRegistrationTy;
   LLVMContext &Context = TheModule->getContext();
-  EHRegistrationTy = StructType::create(Context, "EHRegistrationNode");
+  EHLinkRegistrationTy = StructType::create(Context, "EHRegistrationNode");
   Type *FieldTys[] = {
-      EHRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next
+      EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next
       Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...)
   };
-  EHRegistrationTy->setBody(FieldTys, false);
-  return EHRegistrationTy;
+  EHLinkRegistrationTy->setBody(FieldTys, false);
+  return EHLinkRegistrationTy;
 }
 
 /// The __CxxFrameHandler3 registration node:
@@ -171,40 +207,21 @@ Type *WinEHStatePass::getEHRegistrationType() {
 ///     EHRegistrationNode SubRecord;
 ///     int32_t TryLevel;
 ///   };
-Type *WinEHStatePass::getCXXEH3RegistrationType() {
-  if (CXXEH3RegistrationTy)
-    return CXXEH3RegistrationTy;
+Type *WinEHStatePass::getCXXEHRegistrationType() {
+  if (CXXEHRegistrationTy)
+    return CXXEHRegistrationTy;
   LLVMContext &Context = TheModule->getContext();
   Type *FieldTys[] = {
       Type::getInt8PtrTy(Context), // void *SavedESP
-      getEHRegistrationType(),     // EHRegistrationNode SubRecord
+      getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
       Type::getInt32Ty(Context)    // int32_t TryLevel
   };
-  CXXEH3RegistrationTy =
+  CXXEHRegistrationTy =
       StructType::create(FieldTys, "CXXExceptionRegistration");
-  return CXXEH3RegistrationTy;
-}
-
-/// The _except_handler3 registration node:
-///   struct EH3ExceptionRegistration {
-///     EHRegistrationNode SubRecord;
-///     void *ScopeTable;
-///     int32_t TryLevel;
-///   };
-Type *WinEHStatePass::getSEH3RegistrationType() {
-  if (SEH3RegistrationTy)
-    return SEH3RegistrationTy;
-  LLVMContext &Context = TheModule->getContext();
-  Type *FieldTys[] = {
-      getEHRegistrationType(),     // EHRegistrationNode SubRecord
-      Type::getInt8PtrTy(Context), // void *ScopeTable
-      Type::getInt32Ty(Context)    // int32_t TryLevel
-  };
-  SEH3RegistrationTy = StructType::create(FieldTys, "EH3ExceptionRegistration");
-  return SEH3RegistrationTy;
+  return CXXEHRegistrationTy;
 }
 
-/// The _except_handler4 registration node:
+/// The _except_handler3/4 registration node:
 ///   struct EH4ExceptionRegistration {
 ///     void *SavedESP;
 ///     _EXCEPTION_POINTERS *ExceptionPointers;
@@ -212,19 +229,19 @@ Type *WinEHStatePass::getSEH3RegistrationType() {
 ///     int32_t EncodedScopeTable;
 ///     int32_t TryLevel;
 ///   };
-Type *WinEHStatePass::getSEH4RegistrationType() {
-  if (SEH4RegistrationTy)
-    return SEH4RegistrationTy;
+Type *WinEHStatePass::getSEHRegistrationType() {
+  if (SEHRegistrationTy)
+    return SEHRegistrationTy;
   LLVMContext &Context = TheModule->getContext();
   Type *FieldTys[] = {
       Type::getInt8PtrTy(Context), // void *SavedESP
       Type::getInt8PtrTy(Context), // void *ExceptionPointers
-      getEHRegistrationType(),     // EHRegistrationNode SubRecord
+      getEHLinkRegistrationType(), // EHRegistrationNode SubRecord
       Type::getInt32Ty(Context),   // int32_t EncodedScopeTable
       Type::getInt32Ty(Context)    // int32_t TryLevel
   };
-  SEH4RegistrationTy = StructType::create(FieldTys, "EH4ExceptionRegistration");
-  return SEH4RegistrationTy;
+  SEHRegistrationTy = StructType::create(FieldTys, "SEHExceptionRegistration");
+  return SEHRegistrationTy;
 }
 
 // Emit an exception registration record. These are stack allocations with the
@@ -238,62 +255,63 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
   StringRef PersonalityName = PersonalityFn->getName();
   IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin());
   Type *Int8PtrType = Builder.getInt8PtrTy();
-  Value *SubRecord = nullptr;
-  if (PersonalityName == "__CxxFrameHandler3") {
-    Type *RegNodeTy = getCXXEH3RegistrationType();
-    Value *RegNode = Builder.CreateAlloca(RegNodeTy);
+  if (Personality == EHPersonality::MSVC_CXX) {
+    RegNodeTy = getCXXEHRegistrationType();
+    RegNode = Builder.CreateAlloca(RegNodeTy);
     // FIXME: We can skip this in -GS- mode, when we figure that out.
     // SavedESP = llvm.stacksave()
     Value *SP = Builder.CreateCall(
         Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
     Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
     // TryLevel = -1
-    Builder.CreateStore(Builder.getInt32(-1),
-                        Builder.CreateStructGEP(RegNodeTy, RegNode, 2));
+    StateFieldIndex = 2;
+    insertStateNumberStore(RegNode, Builder.GetInsertPoint(), -1);
     // Handler = __ehhandler$F
     Function *Trampoline = generateLSDAInEAXThunk(F);
-    SubRecord = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
-    linkExceptionRegistration(Builder, SubRecord, Trampoline);
-  } else if (PersonalityName == "_except_handler3") {
-    Type *RegNodeTy = getSEH3RegistrationType();
-    Value *RegNode = Builder.CreateAlloca(RegNodeTy);
-    // TryLevel = -1
-    Builder.CreateStore(Builder.getInt32(-1),
-                        Builder.CreateStructGEP(RegNodeTy, RegNode, 2));
-    // ScopeTable = llvm.x86.seh.lsda(F)
-    Value *LSDA = emitEHLSDA(Builder, F);
-    Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 1));
-    SubRecord = Builder.CreateStructGEP(RegNodeTy, RegNode, 0);
-    linkExceptionRegistration(Builder, SubRecord, PersonalityFn);
-  } else if (PersonalityName == "_except_handler4") {
-    Type *RegNodeTy = getSEH4RegistrationType();
-    Value *RegNode = Builder.CreateAlloca(RegNodeTy);
+    Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1);
+    linkExceptionRegistration(Builder, Trampoline);
+  } else if (Personality == EHPersonality::MSVC_X86SEH) {
+    // If _except_handler4 is in use, some additional guard checks and prologue
+    // stuff is required.
+    bool UseStackGuard = (PersonalityName == "_except_handler4");
+    RegNodeTy = getSEHRegistrationType();
+    RegNode = Builder.CreateAlloca(RegNodeTy);
     // SavedESP = llvm.stacksave()
     Value *SP = Builder.CreateCall(
         Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {});
     Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
-    // TryLevel = -2
-    Builder.CreateStore(Builder.getInt32(-2),
-                        Builder.CreateStructGEP(RegNodeTy, RegNode, 4));
-    // FIXME: XOR the LSDA with __security_cookie.
+    // TryLevel = -2 / -1
+    StateFieldIndex = 4;
+    insertStateNumberStore(RegNode, Builder.GetInsertPoint(),
+                           UseStackGuard ? -2 : -1);
     // ScopeTable = llvm.x86.seh.lsda(F)
     Value *FI8 = Builder.CreateBitCast(F, Int8PtrType);
     Value *LSDA = Builder.CreateCall(
         Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8);
-    Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 1));
-    SubRecord = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
-    linkExceptionRegistration(Builder, SubRecord, PersonalityFn);
+    Type *Int32Ty = Type::getInt32Ty(TheModule->getContext());
+    LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty);
+    // If using _except_handler4, xor the address of the table with
+    // __security_cookie.
+    if (UseStackGuard) {
+      Value *Cookie =
+          TheModule->getOrInsertGlobal("__security_cookie", Int32Ty);
+      Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
+      LSDA = Builder.CreateXor(LSDA, Val);
+    }
+    Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3));
+    Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2);
+    linkExceptionRegistration(Builder, PersonalityFn);
   } else {
     llvm_unreachable("unexpected personality function");
   }
 
-  // FIXME: Insert an unlink before all returns.
+  // Insert an unlink before all returns.
   for (BasicBlock &BB : *F) {
     TerminatorInst *T = BB.getTerminator();
     if (!isa<ReturnInst>(T))
       continue;
     Builder.SetInsertPoint(T);
-    unlinkExceptionRegistration(Builder, SubRecord);
+    unlinkExceptionRegistration(Builder);
   }
 }
 
@@ -342,33 +360,122 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
 }
 
 void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder,
-                                               Value *RegNode, Value *Handler) {
-  Type *RegNodeTy = getEHRegistrationType();
+                                               Value *Handler) {
+  Type *LinkTy = getEHLinkRegistrationType();
   // Handler = Handler
   Handler = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy());
-  Builder.CreateStore(Handler, Builder.CreateStructGEP(RegNodeTy, RegNode, 1));
+  Builder.CreateStore(Handler, Builder.CreateStructGEP(LinkTy, Link, 1));
   // Next = [fs:00]
   Constant *FSZero =
-      Constant::getNullValue(RegNodeTy->getPointerTo()->getPointerTo(257));
+      Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
   Value *Next = Builder.CreateLoad(FSZero);
-  Builder.CreateStore(Next, Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
-  // [fs:00] = RegNode
-  Builder.CreateStore(RegNode, FSZero);
+  Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0));
+  // [fs:00] = Link
+  Builder.CreateStore(Link, FSZero);
 }
 
-void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder,
-                                                 Value *RegNode) {
-  // Clone RegNode into the current BB for better address mode folding.
-  if (auto *GEP = dyn_cast<GetElementPtrInst>(RegNode)) {
+void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
+  // Clone Link into the current BB for better address mode folding.
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) {
     GEP = cast<GetElementPtrInst>(GEP->clone());
     Builder.Insert(GEP);
-    RegNode = GEP;
+    Link = GEP;
   }
-  Type *RegNodeTy = getEHRegistrationType();
-  // [fs:00] = RegNode->Next
+  Type *LinkTy = getEHLinkRegistrationType();
+  // [fs:00] = Link->Next
   Value *Next =
-      Builder.CreateLoad(Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
+      Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0));
   Constant *FSZero =
-      Constant::getNullValue(RegNodeTy->getPointerTo()->getPointerTo(257));
+      Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257));
   Builder.CreateStore(Next, FSZero);
 }
+
+void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) {
+  WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F);
+  calculateWinCXXEHStateNumbers(&F, FuncInfo);
+
+  // Store states in the parent function first; its base state is -1.
+  addCXXStateStoresToFunclet(RegNode, FuncInfo, F, -1);
+
+  // Escape RegNode and record its index in the frameescape argument list.
+  int RegNodeEscapeIndex = escapeRegNode(F);
+
+  // Only insert stores in the handlers listed in HandlerBaseState.
+  Constant *FI8 =
+      ConstantExpr::getBitCast(&F, Type::getInt8PtrTy(TheModule->getContext()));
+  for (auto P : FuncInfo.HandlerBaseState) {
+    Function *Handler = const_cast<Function *>(P.first);
+    int BaseState = P.second;
+    IRBuilder<> Builder(&Handler->getEntryBlock(),
+                        Handler->getEntryBlock().begin());
+    // FIXME: Find and reuse an existing frameaddress call if present.
+    Value *ParentFP = Builder.CreateCall(FrameAddress, {Builder.getInt32(1)});
+    Value *RecoveredRegNode = Builder.CreateCall(
+        FrameRecover, {FI8, ParentFP, Builder.getInt32(RegNodeEscapeIndex)});
+    RecoveredRegNode =
+        Builder.CreateBitCast(RecoveredRegNode, RegNodeTy->getPointerTo(0));
+    addCXXStateStoresToFunclet(RecoveredRegNode, FuncInfo, *Handler, BaseState);
+  }
+}
+
+/// Escape RegNode so that we can access it from child handlers. Find the call
+/// to frameescape, if any, in the entry block and append RegNode to the list
+/// of arguments. Returns the index of RegNode in the new argument list.
+int WinEHStatePass::escapeRegNode(Function &F) {
+  // Find the call to frameescape and extract its arguments.
+  IntrinsicInst *EscapeCall = nullptr;
+  for (Instruction &I : F.getEntryBlock()) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+    if (II && II->getIntrinsicID() == Intrinsic::frameescape) {
+      EscapeCall = II;
+      break;
+    }
+  }
+  SmallVector<Value *, 8> Args;
+  if (EscapeCall) {
+    auto Ops = EscapeCall->arg_operands();
+    Args.append(Ops.begin(), Ops.end());
+  }
+  Args.push_back(RegNode);
+
+  // Replace the call (if it exists) with a new one. Otherwise, insert before
+  // the entry block's terminator so the block stays well formed.
+  IRBuilder<> Builder(EscapeCall ? static_cast<Instruction *>(EscapeCall)
+                                 : F.getEntryBlock().getTerminator());
+  Builder.CreateCall(FrameEscape, Args);
+  if (EscapeCall)
+    EscapeCall->eraseFromParent();
+  return Args.size() - 1;
+}
+
+void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode,
+                                                WinEHFuncInfo &FuncInfo,
+                                                Function &F, int BaseState) {
+  // Iterate all the instructions and emit state number stores.
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (auto *CI = dyn_cast<CallInst>(&I)) {
+        // Possibly throwing call instructions have no actions to take after
+        // an unwind. Ensure they are in this function's base state.
+        if (CI->doesNotThrow())
+          continue;
+        insertStateNumberStore(ParentRegNode, CI, BaseState);
+      } else if (auto *II = dyn_cast<InvokeInst>(&I)) {
+        // Look up the state number of the landingpad this unwinds to.
+        LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst();
+        // FIXME: Why does this assertion fail?
+        //assert(FuncInfo.LandingPadStateMap.count(LPI) && "LP has no state!");
+        int State = FuncInfo.LandingPadStateMap[LPI];
+        insertStateNumberStore(ParentRegNode, II, State);
+      }
+    }
+  }
+}
+
+void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode,
+                                            Instruction *IP, int State) {
+  // Write State to the state field of ParentRegNode, just before IP.
+  IRBuilder<> B(IP);
+  Value *Slot = B.CreateStructGEP(RegNodeTy, ParentRegNode, StateFieldIndex);
+  B.CreateStore(B.getInt32(State), Slot);
+}
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
index 36b3b02a707a..500c84d2a418 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -45,7 +45,8 @@ printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) {
   report_fatal_error("can't handle InlineJT32");
 }
 
-static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
+static void printExpr(const MCExpr *Expr, const MCAsmInfo *MAI,
+                      raw_ostream &OS) {
   int Offset = 0;
   const MCSymbolRefExpr *SRE;
 
@@ -60,7 +61,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
   }
   assert(SRE->getKind() == MCSymbolRefExpr::VK_None);
 
-  OS << SRE->getSymbol();
+  SRE->getSymbol().print(OS, MAI);
 
   if (Offset) {
     if (Offset > 0)
@@ -83,5 +84,5 @@ printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
   }
 
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  printExpr(Op.getExpr(), O);
+  printExpr(Op.getExpr(), &MAI, O);
 }
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index f2d2b37d6f21..3178a4edbb3b 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -8,12 +8,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "XCoreMCAsmInfo.h"
-#include "llvm/ADT/StringRef.h"
 using namespace llvm;
 
 void XCoreMCAsmInfo::anchor() { }
 
-XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
+XCoreMCAsmInfo::XCoreMCAsmInfo(const Triple &TT) {
   SupportsDebugInformation = true;
   Data16bitsDirective = "\t.short\t";
   Data32bitsDirective = "\t.long\t";
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index 26df211eecee..39581e424e8c 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -17,14 +17,14 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-  class StringRef;
-  class Target;
+class Triple;
 
-  class XCoreMCAsmInfo : public MCAsmInfoELF {
-    void anchor() override;
-  public:
-    explicit XCoreMCAsmInfo(StringRef TT);
-  };
+class XCoreMCAsmInfo : public MCAsmInfoELF {
+  void anchor() override;
+
+public:
+  explicit XCoreMCAsmInfo(const Triple &TT);
+};
 
 } // namespace llvm
 
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index ce0d39fe407f..f0e459620c9c 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -54,7 +54,7 @@ static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
 }
 
 static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
-                                       StringRef TT) {
+                                       const Triple &TT) {
   MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
 
   // Initial state of the frame pointer is SP.
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 23e24f2afd5a..702056d781d0 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -37,7 +37,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -100,7 +100,7 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
                           Twine(Sym->getName() + StringRef(".globound")));
     OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Global);
     OutStreamer->EmitAssignment(SymGlob,
-                                MCConstantExpr::Create(ATy->getNumElements(),
+                                MCConstantExpr::create(ATy->getNumElements(),
                                                        OutContext));
     if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
         GV->hasCommonLinkage()) {
@@ -157,7 +157,8 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   unsigned Size = TD->getTypeAllocSize(C->getType());
   if (MAI->hasDotTypeDotSizeDirective()) {
     OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
-    OutStreamer->EmitELFSize(GVSym, MCConstantExpr::Create(Size, OutContext));
+    OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym),
+                             MCConstantExpr::create(Size, OutContext));
   }
   OutStreamer->EmitLabel(GVSym);
   
@@ -201,7 +202,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
     MachineBasicBlock *MBB = JTBBs[i];
     if (i > 0)
       O << ",";
-    O << *MBB->getSymbol();
+    MBB->getSymbol()->print(O, MAI);
   }
 }
 
@@ -217,17 +218,17 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     O << MO.getImm();
     break;
   case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
+    MO.getMBB()->getSymbol()->print(O, MAI);
     break;
   case MachineOperand::MO_GlobalAddress:
-    O << *getSymbol(MO.getGlobal());
+    getSymbol(MO.getGlobal())->print(O, MAI);
     break;
   case MachineOperand::MO_ConstantPoolIndex:
     O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
       << '_' << MO.getIndex();
     break;
   case MachineOperand::MO_BlockAddress:
-    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI);
     break;
   default:
     llvm_unreachable("not implemented");
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index f56caec24d63..aa71241102ff 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -1926,7 +1926,8 @@ static inline bool isImmUs4(int64_t val)
 /// by AM is legal for this target, for a load/store of the specified type.
 bool
 XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              Type *Ty) const {
+                                           Type *Ty,
+                                           unsigned AS) const {
   if (Ty->getTypeID() == Type::VoidTyID)
     return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
 
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 22014ed4bac6..97f0494b6fe3 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -120,7 +120,8 @@ namespace llvm {
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const override;
 
-    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+                               unsigned AS) const override;
 
   private:
     const TargetMachine &TM;
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index cffba5fee03f..03c5fa2e9c42 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -65,7 +65,7 @@ MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
       llvm_unreachable("<unknown operand type>");
   }
 
-  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx);
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol, Kind, *Ctx);
 
   if (!Offset)
     return MCOperand::createExpr(MCSym);
@@ -73,8 +73,8 @@ MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   // Assume offset is never negative.
   assert(Offset > 0);
 
-  const MCConstantExpr *OffsetExpr =  MCConstantExpr::Create(Offset, *Ctx);
-  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::createAdd(MCSym, OffsetExpr, *Ctx);
   return MCOperand::createExpr(Add);
 }