1 files changed, 322 insertions, 98 deletions
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8d32510e2004..081d4ff033bd 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -97,6 +97,8 @@ public:
     return SelectImmShifterOperand(N, A, B, false);
   }
 
+  bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out);
+
   bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
   bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
 
@@ -118,8 +120,10 @@ public:
                        SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
-  bool SelectAddrMode5(SDValue N, SDValue &Base,
-                       SDValue &Offset);
+  bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
+                         int Lwb, int Upb, bool FP16);
+  bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
   bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
   bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
 
@@ -199,10 +203,11 @@ private:
 
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
-  /// for loading D registers.  (Q registers are not supported.)
-  void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *DOpcodes,
-                    const uint16_t *QOpcodes = nullptr);
+  /// for loading D registers.
+  void SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating,
+                    unsigned NumVecs, const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0 = nullptr,
+                    const uint16_t *QOpcodes1 = nullptr);
 
   /// Try to select SBFX/UBFX instructions for ARM.
   bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
@@ -281,7 +286,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
          isInt32Immediate(N->getOperand(1).getNode(), Imm);
 }
 
-/// \brief Check whether a particular node is a constant value representable as
+/// Check whether a particular node is a constant value representable as
 /// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
 ///
 /// \param ScaledConstant [out] - On success, the pre-scaled constant value.
@@ -498,7 +503,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
 
 void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
   CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
-  CurDAG->ReplaceAllUsesWith(N, M);
+  ReplaceUses(N, M);
 }
 
 bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
@@ -567,6 +572,14 @@ bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
   return true;
 }
 
+// Determine whether an ISD::OR's operands are suitable to turn the operation
+// into an addition, which often has more compact encodings.
+bool ARMDAGToDAGISel::SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out) {
+  assert(Parent->getOpcode() == ISD::OR && "unexpected parent");
+  Out = N;
+  return CurDAG->haveNoCommonBitsSet(N, Parent->getOperand(1));
+}
+
 
 bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
                                           SDValue &Base,
@@ -886,8 +899,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
   return true;
 }
 
-bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
-                                      SDValue &Base, SDValue &Offset) {
+bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
+                                        int Lwb, int Upb, bool FP16) {
   if (!CurDAG->isBaseWithConstantOffset(N)) {
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
@@ -907,8 +920,9 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
 
   // If the RHS is +/- imm8, fold into addr mode.
   int RHSC;
-  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
-                              -256 + 1, 256, RHSC)) {
+  const int Scale = FP16 ? 2 : 4;
+
+  if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
@@ -921,17 +935,43 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
       AddSub = ARM_AM::sub;
       RHSC = -RHSC;
     }
-    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
-                                       SDLoc(N), MVT::i32);
+
+    if (FP16)
+      Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC),
+                                         SDLoc(N), MVT::i32);
+    else
+      Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                         SDLoc(N), MVT::i32);
+
     return true;
   }
 
   Base = N;
-  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
-                                     SDLoc(N), MVT::i32);
+
+  if (FP16)
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
+                                       SDLoc(N), MVT::i32);
+  else
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       SDLoc(N), MVT::i32);
+
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+                                      SDValue &Base, SDValue &Offset) {
+  int Lwb = -256 + 1;
+  int Upb = 256;
+  return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
+                                          SDValue &Base, SDValue &Offset) {
+  int Lwb = -512 + 1;
+  int Upb = 512;
+  return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
+}
+
 bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
                                       SDValue &Align) {
   Addr = N;
@@ -1467,7 +1507,7 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
   return false;
 }
 
-/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
+/// Form a GPRPair pseudo register from a pair of GPR regs.
 SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
   SDLoc dl(V0.getNode());
   SDValue RegClass =
@@ -1478,7 +1518,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
 }
 
-/// \brief Form a D register from a pair of S registers.
+/// Form a D register from a pair of S registers.
 SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
   SDLoc dl(V0.getNode());
   SDValue RegClass =
@@ -1489,7 +1529,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
 }
 
-/// \brief Form a quad register from a pair of D registers.
+/// Form a quad register from a pair of D registers.
 SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
   SDLoc dl(V0.getNode());
   SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, dl,
@@ -1500,7 +1540,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
 }
 
-/// \brief Form 4 consecutive D registers from a pair of Q registers.
+/// Form 4 consecutive D registers from a pair of Q registers.
 SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
   SDLoc dl(V0.getNode());
   SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, dl,
@@ -1511,7 +1551,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
 }
 
-/// \brief Form 4 consecutive S registers.
+/// Form 4 consecutive S registers.
 SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
   SDLoc dl(V0.getNode());
@@ -1526,7 +1566,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
 }
 
-/// \brief Form 4 consecutive D registers.
+/// Form 4 consecutive D registers.
 SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
   SDLoc dl(V0.getNode());
@@ -1541,7 +1581,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops);
 }
 
-/// \brief Form 4 consecutive Q registers.
+/// Form 4 consecutive Q registers.
 SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
   SDLoc dl(V0.getNode());
@@ -1708,7 +1748,9 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating;  // By coincidence, all supported updating
+                                   // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
 
@@ -1732,9 +1774,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 2; break;
   case MVT::v2f64:
-  case MVT::v2i64: OpcodeIndex = 3;
-    assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
-    break;
+  case MVT::v2i64: OpcodeIndex = 3; break;
   }
 
   EVT ResTy;
@@ -1765,15 +1805,17 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
     Ops.push_back(Align);
     if (isUpdating) {
       SDValue Inc = N->getOperand(AddrOpIdx + 1);
-      // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
-      // case entirely when the rest are updated to that form, too.
       bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
-      if ((NumVecs <= 2) && !IsImmUpdate)
-        Opc = getVLDSTRegisterUpdateOpcode(Opc);
-      // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
-      // check for that explicitly too. Horribly hacky, but temporary.
-      if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
-        Ops.push_back(IsImmUpdate ? Reg0 : Inc);
+      if (!IsImmUpdate) {
+        // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+        // check for the opcode rather than the number of vector elements.
+        if (isVLDfixed(Opc))
+          Opc = getVLDSTRegisterUpdateOpcode(Opc);
+        Ops.push_back(Inc);
+      // VLD1/VLD2 fixed increment does not need Reg0 so only include it in
+      // the operands if not such an opcode.
+      } else if (!isVLDfixed(Opc))
+        Ops.push_back(Reg0);
     }
     Ops.push_back(Pred);
     Ops.push_back(Reg0);
@@ -1844,7 +1886,9 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating;  // By coincidence, all supported updating
+                                   // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -1862,19 +1906,19 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   default: llvm_unreachable("unhandled vst type");
     // Double-register operations:
   case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4f16:
   case MVT::v4i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
   case MVT::v2i32: OpcodeIndex = 2; break;
   case MVT::v1i64: OpcodeIndex = 3; break;
     // Quad-register operations:
   case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8f16:
   case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 2; break;
   case MVT::v2f64:
-  case MVT::v2i64: OpcodeIndex = 3;
-    assert(NumVecs == 1 && "v2i64 type only supported for VST1");
-    break;
+  case MVT::v2i64: OpcodeIndex = 3; break;
   }
 
   std::vector<EVT> ResTys;
@@ -1919,16 +1963,17 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
     Ops.push_back(Align);
     if (isUpdating) {
       SDValue Inc = N->getOperand(AddrOpIdx + 1);
-      // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
-      // case entirely when the rest are updated to that form, too.
       bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
-      if (NumVecs <= 2 && !IsImmUpdate)
-        Opc = getVLDSTRegisterUpdateOpcode(Opc);
-      // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
-      // check for that explicitly too. Horribly hacky, but temporary.
-      if  (!IsImmUpdate)
+      if (!IsImmUpdate) {
+        // We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so
+        // check for the opcode rather than the number of vector elements.
+        if (isVSTfixed(Opc))
+          Opc = getVLDSTRegisterUpdateOpcode(Opc);
         Ops.push_back(Inc);
-      else if (NumVecs > 2 && !isVSTfixed(Opc))
+      }
+      // VST1/VST2 fixed increment does not need Reg0 so only include it in
+      // the operands if not such an opcode.
+      else if (!isVSTfixed(Opc))
         Ops.push_back(Reg0);
     }
     Ops.push_back(SrcReg);
@@ -1993,7 +2038,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  bool IsIntrinsic = !isUpdating;  // By coincidence, all supported updating
+                                   // nodes are not intrinsics.
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
@@ -2109,21 +2156,22 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
   CurDAG->RemoveDeadNode(N);
 }
 
-void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
+                                   bool isUpdating, unsigned NumVecs,
                                    const uint16_t *DOpcodes,
-                                   const uint16_t *QOpcodes) {
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
-  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+  unsigned AddrOpIdx = IsIntrinsic ? 2 : 1;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return;
 
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();
 
   unsigned Alignment = 0;
   if (NumVecs != 3) {
@@ -2140,49 +2188,84 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
   }
   Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
 
-  unsigned Opc;
+  unsigned OpcodeIndex;
   switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("unhandled vld-dup type");
-  case MVT::v8i8:  Opc = DOpcodes[0]; break;
-  case MVT::v16i8: Opc = QOpcodes[0]; break;
-  case MVT::v4i16: Opc = DOpcodes[1]; break;
-  case MVT::v8i16: Opc = QOpcodes[1]; break;
+  case MVT::v8i8:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v4i16:
+  case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
-  case MVT::v2i32: Opc = DOpcodes[2]; break;
+  case MVT::v2i32:
   case MVT::v4f32:
-  case MVT::v4i32: Opc = QOpcodes[2]; break;
-  }
-
-  SDValue Pred = getAL(CurDAG, dl);
-  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(MemAddr);
-  Ops.push_back(Align);
-  if (isUpdating) {
-    // fixed-stride update instructions don't have an explicit writeback
-    // operand. It's implicit in the opcode itself.
-    SDValue Inc = N->getOperand(2);
-    bool IsImmUpdate =
-        isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
-    if (NumVecs <= 2 && !IsImmUpdate)
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    if (!IsImmUpdate)
-      Ops.push_back(Inc);
-    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
-    else if (NumVecs > 2)
-      Ops.push_back(Reg0);
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v1f64:
+  case MVT::v1i64: OpcodeIndex = 3; break;
   }
-  Ops.push_back(Pred);
-  Ops.push_back(Reg0);
-  Ops.push_back(Chain);
 
   unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  if (!is64BitVector)
+    ResTyElts *= 2;
+  EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+
   std::vector<EVT> ResTys;
-  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+  ResTys.push_back(ResTy);
   if (isUpdating)
     ResTys.push_back(MVT::i32);
   ResTys.push_back(MVT::Other);
-  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  SDValue Pred = getAL(CurDAG, dl);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+  SDNode *VLdDup;
+  if (is64BitVector || NumVecs == 1) {
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    unsigned Opc = is64BitVector ? DOpcodes[OpcodeIndex] :
+                                   QOpcodes0[OpcodeIndex];
+    if (isUpdating) {
+      // fixed-stride update instructions don't have an explicit writeback
+      // operand. It's implicit in the opcode itself.
+      SDValue Inc = N->getOperand(2);
+      bool IsImmUpdate =
+          isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+      if (NumVecs <= 2 && !IsImmUpdate)
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      if (!IsImmUpdate)
+        Ops.push_back(Inc);
+      // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+      else if (NumVecs > 2)
+        Ops.push_back(Reg0);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  } else if (NumVecs == 2) {
+    const SDValue OpsA[] = { MemAddr, Align, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+                                          dl, ResTys, OpsA);
+
+    Chain = SDValue(VLdA, 1);
+    const SDValue OpsB[] = { MemAddr, Align, Pred, Reg0, Chain };
+    VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+  } else {
+    SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+    const SDValue OpsA[] = { MemAddr, Align, ImplDef, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex],
+                                          dl, ResTys, OpsA);
+
+    SDValue SuperReg = SDValue(VLdA, 0);
+    Chain = SDValue(VLdA, 1);
+    const SDValue OpsB[] = { MemAddr, Align, SuperReg, Pred, Reg0, Chain };
+    VLdDup = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys, OpsB);
+  }
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
 
   // Extract the subregisters.
@@ -2191,10 +2274,11 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
   } else {
     SDValue SuperReg = SDValue(VLdDup, 0);
     static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
-    unsigned SubIdx = ARM::dsub_0;
-    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+    for (unsigned Vec = 0; Vec != NumVecs; ++Vec) {
       ReplaceUses(SDValue(N, Vec),
                   CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+    }
   }
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
@@ -2253,6 +2337,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
           return true;
         }
 
+        assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
         SDValue Ops[] = { N->getOperand(0).getOperand(0),
                           CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                           CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2277,6 +2362,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
       if (LSB < 0)
         return false;
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      assert(LSB + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                         CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                         CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2298,6 +2384,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
       // Note: The width operand is encoded as width-1.
       unsigned Width = MSB - LSB;
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      assert(Srl_imm + Width + 1 <= 32 && "Shouldn't create an invalid ubfx");
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                         CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
                         CurDAG->getTargetConstant(Width, dl, MVT::i32),
@@ -2318,6 +2405,7 @@ bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
       return false;
 
     SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+    assert(LSB + Width <= 32 && "Shouldn't create an invalid ubfx");
     SDValue Ops[] = { N->getOperand(0).getOperand(0),
                       CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                       CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
@@ -2427,7 +2515,7 @@ void ARMDAGToDAGISel::SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI) {
   SDValue X = And.getOperand(0);
   auto C = dyn_cast<ConstantSDNode>(And.getOperand(1));
 
-  if (!C || !X->hasOneUse())
+  if (!C)
     return;
   auto Range = getContiguousRangeOfSetBits(C->getAPIntValue());
   if (!Range)
@@ -2765,7 +2853,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     }
   }
   case ARMISD::SUBE: {
-    if (!Subtarget->hasV6Ops())
+    if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
       break;
     // Look for a pattern to match SMMLS
     // (sube a, (smul_loHi a, b), (subc 0, (smul_LOhi(a, b))))
@@ -3026,14 +3114,14 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD1DUPd32 };
     static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
                                          ARM::VLD1DUPq32 };
-    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 1, DOpcodes, QOpcodes);
     return;
   }
 
   case ARMISD::VLD2DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                         ARM::VLD2DUPd32 };
-    SelectVLDDup(N, false, 2, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 2, Opcodes);
     return;
   }
 
@@ -3041,7 +3129,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
                                         ARM::VLD3DUPd16Pseudo,
                                         ARM::VLD3DUPd32Pseudo };
-    SelectVLDDup(N, false, 3, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 3, Opcodes);
     return;
   }
 
@@ -3049,7 +3137,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
                                         ARM::VLD4DUPd16Pseudo,
                                         ARM::VLD4DUPd32Pseudo };
-    SelectVLDDup(N, false, 4, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, false, 4, Opcodes);
     return;
   }
 
@@ -3060,7 +3148,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
                                          ARM::VLD1DUPq16wb_fixed,
                                          ARM::VLD1DUPq32wb_fixed };
-    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 1, DOpcodes, QOpcodes);
     return;
   }
 
@@ -3068,7 +3156,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                         ARM::VLD2DUPd16wb_fixed,
                                         ARM::VLD2DUPd32wb_fixed };
-    SelectVLDDup(N, true, 2, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 2, Opcodes);
     return;
   }
 
@@ -3076,7 +3164,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
                                         ARM::VLD3DUPd16Pseudo_UPD,
                                         ARM::VLD3DUPd32Pseudo_UPD };
-    SelectVLDDup(N, true, 3, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 3, Opcodes);
     return;
   }
 
@@ -3084,7 +3172,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
                                         ARM::VLD4DUPd16Pseudo_UPD,
                                         ARM::VLD4DUPd32Pseudo_UPD };
-    SelectVLDDup(N, true, 4, Opcodes);
+    SelectVLDDup(N, /* IsIntrinsic= */ false, true, 4, Opcodes);
     return;
   }
 
@@ -3407,6 +3495,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     }
 
+    case Intrinsic::arm_neon_vld1x2: {
+      static const uint16_t DOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+                                           ARM::VLD1q32, ARM::VLD1q64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD1d8QPseudo,
+                                           ARM::VLD1d16QPseudo,
+                                           ARM::VLD1d32QPseudo,
+                                           ARM::VLD1d64QPseudo };
+      SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld1x3: {
+      static const uint16_t DOpcodes[] = { ARM::VLD1d8TPseudo,
+                                           ARM::VLD1d16TPseudo,
+                                           ARM::VLD1d32TPseudo,
+                                           ARM::VLD1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowTPseudo_UPD,
+                                            ARM::VLD1q16LowTPseudo_UPD,
+                                            ARM::VLD1q32LowTPseudo_UPD,
+                                            ARM::VLD1q64LowTPseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighTPseudo,
+                                            ARM::VLD1q16HighTPseudo,
+                                            ARM::VLD1q32HighTPseudo,
+                                            ARM::VLD1q64HighTPseudo };
+      SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld1x4: {
+      static const uint16_t DOpcodes[] = { ARM::VLD1d8QPseudo,
+                                           ARM::VLD1d16QPseudo,
+                                           ARM::VLD1d32QPseudo,
+                                           ARM::VLD1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD1q8LowQPseudo_UPD,
+                                            ARM::VLD1q16LowQPseudo_UPD,
+                                            ARM::VLD1q32LowQPseudo_UPD,
+                                            ARM::VLD1q64LowQPseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD1q8HighQPseudo,
+                                            ARM::VLD1q16HighQPseudo,
+                                            ARM::VLD1q32HighQPseudo,
+                                            ARM::VLD1q64HighQPseudo };
+      SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
     case Intrinsic::arm_neon_vld2: {
       static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
                                            ARM::VLD2d32, ARM::VLD1q64 };
@@ -3446,6 +3579,52 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     }
 
+    case Intrinsic::arm_neon_vld2dup: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+                                           ARM::VLD2DUPd32, ARM::VLD1q64 };
+      static const uint16_t QOpcodes0[] = { ARM::VLD2DUPq8EvenPseudo,
+                                            ARM::VLD2DUPq16EvenPseudo,
+                                            ARM::VLD2DUPq32EvenPseudo };
+      static const uint16_t QOpcodes1[] = { ARM::VLD2DUPq8OddPseudo,
+                                            ARM::VLD2DUPq16OddPseudo,
+                                            ARM::VLD2DUPq32OddPseudo };
+      SelectVLDDup(N, /* IsIntrinsic= */ true, false, 2,
+                   DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld3dup: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3DUPd8Pseudo,
+                                           ARM::VLD3DUPd16Pseudo,
+                                           ARM::VLD3DUPd32Pseudo,
+                                           ARM::VLD1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD3DUPq8EvenPseudo,
+                                            ARM::VLD3DUPq16EvenPseudo,
+                                            ARM::VLD3DUPq32EvenPseudo };
+      static const uint16_t QOpcodes1[] = { ARM::VLD3DUPq8OddPseudo,
+                                            ARM::VLD3DUPq16OddPseudo,
+                                            ARM::VLD3DUPq32OddPseudo };
+      SelectVLDDup(N, /* IsIntrinsic= */ true, false, 3,
+                   DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vld4dup: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4DUPd8Pseudo,
+                                           ARM::VLD4DUPd16Pseudo,
+                                           ARM::VLD4DUPd32Pseudo,
+                                           ARM::VLD1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD4DUPq8EvenPseudo,
+                                            ARM::VLD4DUPq16EvenPseudo,
+                                            ARM::VLD4DUPq32EvenPseudo };
+      static const uint16_t QOpcodes1[] = { ARM::VLD4DUPq8OddPseudo,
+                                            ARM::VLD4DUPq16OddPseudo,
+                                            ARM::VLD4DUPq32OddPseudo };
+      SelectVLDDup(N, /* IsIntrinsic= */ true, false, 4,
+                   DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
     case Intrinsic::arm_neon_vld2lane: {
       static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
                                            ARM::VLD2LNd16Pseudo,
@@ -3485,6 +3664,51 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     }
 
+    case Intrinsic::arm_neon_vst1x2: {
+      static const uint16_t DOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+                                           ARM::VST1q32, ARM::VST1q64 };
+      static const uint16_t QOpcodes[] = { ARM::VST1d8QPseudo,
+                                           ARM::VST1d16QPseudo,
+                                           ARM::VST1d32QPseudo,
+                                           ARM::VST1d64QPseudo };
+      SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst1x3: {
+      static const uint16_t DOpcodes[] = { ARM::VST1d8TPseudo,
+                                           ARM::VST1d16TPseudo,
+                                           ARM::VST1d32TPseudo,
+                                           ARM::VST1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST1q8LowTPseudo_UPD,
+                                            ARM::VST1q16LowTPseudo_UPD,
+                                            ARM::VST1q32LowTPseudo_UPD,
+                                            ARM::VST1q64LowTPseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST1q8HighTPseudo,
+                                            ARM::VST1q16HighTPseudo,
+                                            ARM::VST1q32HighTPseudo,
+                                            ARM::VST1q64HighTPseudo };
+      SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
+    case Intrinsic::arm_neon_vst1x4: {
+      static const uint16_t DOpcodes[] = { ARM::VST1d8QPseudo,
+                                           ARM::VST1d16QPseudo,
+                                           ARM::VST1d32QPseudo,
+                                           ARM::VST1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST1q8LowQPseudo_UPD,
+                                            ARM::VST1q16LowQPseudo_UPD,
+                                            ARM::VST1q32LowQPseudo_UPD,
+                                            ARM::VST1q64LowQPseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST1q8HighQPseudo,
+                                            ARM::VST1q16HighQPseudo,
+                                            ARM::VST1q32HighQPseudo,
+                                            ARM::VST1q64HighQPseudo };
+      SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
+    }
+
     case Intrinsic::arm_neon_vst2: {
       static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
                                            ARM::VST2d32, ARM::VST1q64 };