From 66e41e3c6e8b8fbc48d5d3b4d2bd9ce0be4ecb75 Mon Sep 17 00:00:00 2001 From: Roman Divacky Date: Tue, 13 Jul 2010 17:19:57 +0000 Subject: Update LLVM to r108243. --- lib/Target/ARM/ARM.h | 4 - lib/Target/ARM/ARMAddressingModes.h | 64 ++ lib/Target/ARM/ARMBaseInstrInfo.cpp | 629 +++++++-------- lib/Target/ARM/ARMBaseInstrInfo.h | 87 +- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 191 +++-- lib/Target/ARM/ARMBaseRegisterInfo.h | 24 +- lib/Target/ARM/ARMCodeEmitter.cpp | 174 +++- lib/Target/ARM/ARMConstantIslandPass.cpp | 58 +- lib/Target/ARM/ARMConstantPoolValue.h | 1 + lib/Target/ARM/ARMExpandPseudoInsts.cpp | 10 +- lib/Target/ARM/ARMISelDAGToDAG.cpp | 551 ++++++------- lib/Target/ARM/ARMISelLowering.cpp | 893 +++++++++++++++------ lib/Target/ARM/ARMISelLowering.h | 47 +- lib/Target/ARM/ARMInstrFormats.td | 44 +- lib/Target/ARM/ARMInstrInfo.cpp | 2 +- lib/Target/ARM/ARMInstrInfo.h | 2 +- lib/Target/ARM/ARMInstrInfo.td | 128 ++- lib/Target/ARM/ARMInstrNEON.td | 99 +-- lib/Target/ARM/ARMInstrThumb.td | 23 +- lib/Target/ARM/ARMInstrThumb2.td | 63 +- lib/Target/ARM/ARMInstrVFP.td | 8 +- lib/Target/ARM/ARMJITInfo.h | 3 +- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 194 +++-- lib/Target/ARM/ARMMachineFunctionInfo.h | 12 +- lib/Target/ARM/ARMRegisterInfo.td | 87 +- lib/Target/ARM/ARMScheduleA8.td | 84 +- lib/Target/ARM/ARMScheduleA9.td | 364 +++++---- lib/Target/ARM/ARMScheduleV6.td | 2 +- lib/Target/ARM/ARMTargetMachine.cpp | 30 +- lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 23 +- lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 59 +- lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp | 24 +- lib/Target/ARM/AsmPrinter/ARMInstPrinter.h | 5 +- lib/Target/ARM/CMakeLists.txt | 1 + .../ARM/Disassembler/ARMDisassemblerCore.cpp | 113 +-- lib/Target/ARM/Disassembler/ARMDisassemblerCore.h | 8 +- .../ARM/Disassembler/ThumbDisassemblerCore.h | 42 +- lib/Target/ARM/NEONMoveFix.cpp | 4 +- lib/Target/ARM/NEONPreAllocPass.cpp | 51 +- lib/Target/ARM/Thumb1InstrInfo.cpp | 125 +-- lib/Target/ARM/Thumb1InstrInfo.h | 24 +- lib/Target/ARM/Thumb1RegisterInfo.cpp | 17 +- lib/Target/ARM/Thumb1RegisterInfo.h | 6 +- lib/Target/ARM/Thumb2HazardRecognizer.cpp | 53 ++ lib/Target/ARM/Thumb2HazardRecognizer.h | 40 + lib/Target/ARM/Thumb2ITBlockPass.cpp | 160 +++- lib/Target/ARM/Thumb2InstrInfo.cpp | 192 ++++- lib/Target/ARM/Thumb2InstrInfo.h | 39 +- lib/Target/ARM/Thumb2SizeReduction.cpp | 16 +- 49 files changed, 2977 insertions(+), 1903 deletions(-) create mode 100644 lib/Target/ARM/Thumb2HazardRecognizer.cpp create mode 100644 lib/Target/ARM/Thumb2HazardRecognizer.h (limited to 'lib/Target/ARM') diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index ae7ae59c9262..14825a785649 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -90,10 +90,6 @@ inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { } } -/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model -/// operations involving sub-registers. -bool ModelWithRegSequence(); - FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h index e68354a42f8d..d316b13e0488 100644 --- a/lib/Target/ARM/ARMAddressingModes.h +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -520,6 +520,70 @@ namespace ARM_AM { // This is stored in two operands [regaddr, align]. The first is the // address register. The second operand is the value of the alignment // specifier to use or zero if no explicit alignment. 
+ // Valid alignments are: 0, 8, 16, and 32 bytes, depending on the specific + // instruction. + + //===--------------------------------------------------------------------===// + // NEON Modified Immediates + //===--------------------------------------------------------------------===// + // + // Several NEON instructions (e.g., VMOV) take a "modified immediate" + // vector operand, where a small immediate encoded in the instruction + // specifies a full NEON vector value. These modified immediates are + // represented here as encoded integers. The low 8 bits hold the immediate + // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold + // the "Cmode" field of the instruction. The interfaces below treat the + // Op and Cmode values as a single 5-bit value. + + static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) { + return (OpCmode << 8) | Val; + } + static inline unsigned getNEONModImmOpCmode(unsigned ModImm) { + return (ModImm >> 8) & 0x1f; + } + static inline unsigned getNEONModImmVal(unsigned ModImm) { + return ModImm & 0xff; + } + + /// decodeNEONModImm - Decode a NEON modified immediate value into the + /// element value and the element size in bits. (If the element size is + /// smaller than the vector, it is splatted into all the elements.) + static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) { + unsigned OpCmode = getNEONModImmOpCmode(ModImm); + unsigned Imm8 = getNEONModImmVal(ModImm); + uint64_t Val = 0; + + if (OpCmode == 0xe) { + // 8-bit vector elements + Val = Imm8; + EltBits = 8; + } else if ((OpCmode & 0xc) == 0x8) { + // 16-bit vector elements + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 16; + } else if ((OpCmode & 0x8) == 0) { + // 32-bit vector elements, zero with one byte set + unsigned ByteNum = (OpCmode & 0x6) >> 1; + Val = Imm8 << (8 * ByteNum); + EltBits = 32; + } else if ((OpCmode & 0xe) == 0xc) { + // 32-bit vector elements, one byte with low bits set + unsigned ByteNum = 1 + (OpCmode & 0x1); + Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum))); + EltBits = 32; + } else if (OpCmode == 0x1e) { + // 64-bit vector elements + for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { + if ((ModImm >> ByteNum) & 1) + Val |= (uint64_t)0xff << (8 * ByteNum); + } + EltBits = 64; + } else { + assert(false && "Unsupported NEON immediate"); + } + return Val; + } } // end namespace ARM_AM } // end namespace llvm diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2528854133e5..49c16f3e0720 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -56,7 +56,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr *MI = MBBI; MachineFunction &MF = *MI->getParent()->getParent(); - unsigned TSFlags = MI->getDesc().TSFlags; + uint64_t TSFlags = MI->getDesc().TSFlags; bool isPre = false; switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { default: return NULL; @@ -199,9 +199,9 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, bool ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -227,8 +227,9 @@ ARMBaseInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // 
Insert the spill to the stack frame. The register is killed at the spill // + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); storeRegToStackSlot(MBB, MI, Reg, isKill, - CSI[i].getFrameIdx(), CSI[i].getRegClass(), TRI); + CSI[i].getFrameIdx(), RC, TRI); } return true; } @@ -347,10 +348,8 @@ unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { unsigned ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond) const { - // FIXME this should probably have a DebugLoc argument - DebugLoc dl; - + const SmallVectorImpl &Cond, + DebugLoc DL) const { ARMFunctionInfo *AFI = MBB.getParent()->getInfo(); int BOpc = !AFI->isThumbFunction() ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB); @@ -364,17 +363,17 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (FBB == 0) { if (Cond.empty()) // Unconditional branch? - BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); else - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); return 1; } // Two-way conditional branch. - BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB) + BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB) .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()); - BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB); + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); return 2; } @@ -487,7 +486,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { // Basic size info comes from the TSFlags field. const TargetInstrDesc &TID = MI->getDesc(); - unsigned TSFlags = TID.TSFlags; + uint64_t TSFlags = TID.TSFlags; unsigned Opc = MI->getOpcode(); switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { @@ -524,11 +523,11 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { return 10; case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: - return 24; + return 20; case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: - return 14; + return 12; case ARM::BR_JTr: case ARM::BR_JTm: case ARM::BR_JTadd: @@ -595,6 +594,7 @@ ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI, return true; } case ARM::MOVr: + case ARM::MOVr_TC: case ARM::tMOVr: case ARM::tMOVgpr2tgpr: case ARM::tMOVtgpr2gpr: @@ -693,75 +693,44 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -bool -ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - // tGPR is used sometimes in ARM instructions that need to avoid using - // certain registers. Just treat it as GPR here. - if (DestRC == ARM::tGPRRegisterClass) - DestRC = ARM::GPRRegisterClass; - if (SrcRC == ARM::tGPRRegisterClass) - SrcRC = ARM::GPRRegisterClass; - - // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies. - if (DestRC == ARM::DPR_8RegisterClass) - DestRC = ARM::DPR_VFP2RegisterClass; - if (SrcRC == ARM::DPR_8RegisterClass) - SrcRC = ARM::DPR_VFP2RegisterClass; - - // Allow QPR / QPR_VFP2 / QPR_8 cross-class copies. - if (DestRC == ARM::QPR_VFP2RegisterClass || - DestRC == ARM::QPR_8RegisterClass) - DestRC = ARM::QPRRegisterClass; - if (SrcRC == ARM::QPR_VFP2RegisterClass || - SrcRC == ARM::QPR_8RegisterClass) - SrcRC = ARM::QPRRegisterClass; - - // Allow QQPR / QQPR_VFP2 cross-class copies. 
- if (DestRC == ARM::QQPR_VFP2RegisterClass) - DestRC = ARM::QQPRRegisterClass; - if (SrcRC == ARM::QQPR_VFP2RegisterClass) - SrcRC = ARM::QQPRRegisterClass; - - // Disallow copies of unequal sizes. - if (DestRC != SrcRC && DestRC->getSize() != SrcRC->getSize()) - return false; - - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::SPRRegisterClass) - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVRS), DestReg) - .addReg(SrcReg)); - else - AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), - DestReg).addReg(SrcReg))); - } else { - unsigned Opc; - - if (DestRC == ARM::SPRRegisterClass) - Opc = (SrcRC == ARM::GPRRegisterClass ? ARM::VMOVSR : ARM::VMOVS); - else if (DestRC == ARM::DPRRegisterClass) - Opc = ARM::VMOVD; - else if (DestRC == ARM::DPR_VFP2RegisterClass || - SrcRC == ARM::DPR_VFP2RegisterClass) - // Always use neon reg-reg move if source or dest is NEON-only regclass. - Opc = ARM::VMOVDneon; - else if (DestRC == ARM::QPRRegisterClass) - Opc = ARM::VMOVQ; - else if (DestRC == ARM::QQPRRegisterClass) - Opc = ARM::VMOVQQ; - else if (DestRC == ARM::QQQQPRRegisterClass) - Opc = ARM::VMOVQQQQ; - else - return false; - - AddDefaultPred(BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg)); +void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool GPRDest = ARM::GPRRegClass.contains(DestReg); + bool GPRSrc = ARM::GPRRegClass.contains(SrcReg); + + if (GPRDest && GPRSrc) { + AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)))); + return; } - return true; + bool SPRDest = ARM::SPRRegClass.contains(DestReg); + bool SPRSrc = ARM::SPRRegClass.contains(SrcReg); + + unsigned Opc; + if (SPRDest && SPRSrc) + Opc = ARM::VMOVS; + else if (GPRDest && SPRSrc) + Opc = ARM::VMOVRS; + else if (SPRDest && GPRSrc) + Opc = ARM::VMOVSR; + else if (ARM::DPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD; + else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQ; + else if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQQ; + else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVQQQQ; + else + llvm_unreachable("Impossible reg-to-reg copy"); + + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); + MIB.addReg(SrcReg, getKillRegState(KillSrc)); + if (Opc != ARM::VMOVQQ && Opc != ARM::VMOVQQQQ) + AddDefaultPred(MIB); } static const @@ -795,30 +764,34 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) RC = ARM::GPRRegisterClass; - if (RC == ARM::GPRRegisterClass) { + switch (RC->getID()) { + case ARM::GPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::SPRRegisterClass) { + break; + case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::DPRRegisterClass || - RC == ARM::DPR_VFP2RegisterClass || - RC == ARM::DPR_8RegisterClass) { + break; + case ARM::DPRRegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::DPR_8RegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::QPRRegisterClass || - RC == ARM::QPR_VFP2RegisterClass || - RC == ARM::QPR_8RegisterClass) { + break; + case ARM::QPRRegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::QPR_8RegClassID: // FIXME: Neon instructions should support predicates if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q)) - .addFrameIndex(FI).addImm(128) + .addFrameIndex(FI).addImm(16) .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO)); } else { @@ -828,12 +801,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) .addMemOperand(MMO)); } - } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + break; + case ARM::QQPRRegClassID: + case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST2q32)) - .addFrameIndex(FI).addImm(128); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VST1d64Q)) + .addFrameIndex(FI).addImm(16); MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); @@ -850,8 +825,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); } - } else { - assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); + break; + case ARM::QQQQPRRegClassID: { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMD)) .addFrameIndex(FI) @@ -865,6 +840,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + break; + } + default: + llvm_unreachable("Unknown regclass!"); } } @@ -886,26 +865,30 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, // tGPR is used sometimes in ARM instructions that need to avoid using // certain registers. Just treat it as GPR here. 
- if (RC == ARM::tGPRRegisterClass) + if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass) RC = ARM::GPRRegisterClass; - if (RC == ARM::GPRRegisterClass) { + switch (RC->getID()) { + case ARM::GPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg) .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::SPRRegisterClass) { + break; + case ARM::SPRRegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::DPRRegisterClass || - RC == ARM::DPR_VFP2RegisterClass || - RC == ARM::DPR_8RegisterClass) { + break; + case ARM::DPRRegClassID: + case ARM::DPR_VFP2RegClassID: + case ARM::DPR_8RegClassID: AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); - } else if (RC == ARM::QPRRegisterClass || - RC == ARM::QPR_VFP2RegisterClass || - RC == ARM::QPR_8RegisterClass) { + break; + case ARM::QPRRegClassID: + case ARM::QPR_VFP2RegClassID: + case ARM::QPR_8RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q), DestReg) - .addFrameIndex(FI).addImm(128) + .addFrameIndex(FI).addImm(16) .addMemOperand(MMO)); } else { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQ), DestReg) @@ -913,14 +896,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) .addMemOperand(MMO)); } - } else if (RC == ARM::QQPRRegisterClass || RC == ARM::QQPR_VFP2RegisterClass){ + break; + case ARM::QQPRRegClassID: + case ARM::QQPR_VFP2RegClassID: if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD2q32)); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLD1d64Q)); MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - AddDefaultPred(MIB.addFrameIndex(FI).addImm(128).addMemOperand(MMO)); + AddDefaultPred(MIB.addFrameIndex(FI).addImm(16).addMemOperand(MMO)); } else { MachineInstrBuilder MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) @@ -932,21 +917,25 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); } - } else { - assert(RC == ARM::QQQQPRRegisterClass && "Unknown regclass!"); - MachineInstrBuilder MIB = - AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) - .addFrameIndex(FI) - .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) - .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); - AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + break; + case ARM::QQQQPRRegClassID: { + MachineInstrBuilder MIB = + AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMD)) + .addFrameIndex(FI) + .addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4))) + 
.addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI); + AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI); + break; + } + default: + llvm_unreachable("Unknown regclass!"); } } @@ -960,223 +949,6 @@ ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, return &*MIB; } -MachineInstr *ARMBaseInstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { - // If it is updating CPSR, then it cannot be folded. - if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead()) - return NULL; - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (Opc == ARM::MOVr) - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - else // ARM::t2MOVr - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - if (Opc == ARM::MOVr) - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), DstSubReg) - .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg); - else // ARM::t2MOVr - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::tMOVgpr2gpr || - Opc == ARM::tMOVtgpr2gpr || - Opc == ARM::tMOVgpr2tgpr) { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12)) - .addReg(DstReg, - 
RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0); - } - } else if (Opc == ARM::VMOVS) { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRS)) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI) - .addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRS)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::VMOVD || Opc == ARM::VMOVDneon) { - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRD)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRD)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg); - } - } else if (Opc == ARM::VMOVQ) { - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Pred = MI->getOperand(2).getImm(); - unsigned PredReg = MI->getOperand(3).getReg(); - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - unsigned SrcSubReg = MI->getOperand(1).getSubReg(); - bool isKill = MI->getOperand(1).isKill(); - bool isUndef = MI->getOperand(1).isUndef(); - if (MFI.getObjectAlignment(FI) >= 16 && - getRegisterInfo().canRealignStack(MF)) { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VST1q)) - .addFrameIndex(FI).addImm(128) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addImm(Pred).addReg(PredReg); - } else { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTMQ)) - .addReg(SrcReg, - getKillRegState(isKill) | getUndefRegState(isUndef), - SrcSubReg) - .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) - .addImm(Pred).addReg(PredReg); - } - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned DstSubReg = MI->getOperand(0).getSubReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isUndef = MI->getOperand(0).isUndef(); - if (MFI.getObjectAlignment(FI) >= 16 && - getRegisterInfo().canRealignStack(MF)) { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLD1q)) - .addReg(DstReg, - 
RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(128).addImm(Pred).addReg(PredReg); - } else { - NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDMQ)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(isDead) | - getUndefRegState(isUndef), - DstSubReg) - .addFrameIndex(FI).addImm(ARM_AM::getAM5Opc(ARM_AM::ia, 4)) - .addImm(Pred).addReg(PredReg); - } - } - } - - return NewMI; -} - -MachineInstr* -ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl &Ops, - MachineInstr* LoadMI) const { - // FIXME - return 0; -} - -bool -ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl &Ops) const { - if (Ops.size() != 1) return false; - - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) { - // If it is updating CPSR, then it cannot be folded. - return MI->getOperand(4).getReg() != ARM::CPSR || - MI->getOperand(4).isDead(); - } else if (Opc == ARM::tMOVgpr2gpr || - Opc == ARM::tMOVtgpr2gpr || - Opc == ARM::tMOVgpr2tgpr) { - return true; - } else if (Opc == ARM::VMOVS || Opc == ARM::VMOVD || - Opc == ARM::VMOVDneon || Opc == ARM::VMOVQ) { - return true; - } - - // FIXME: VMOVQQ and VMOVQQQQ? - - return false; -} - /// Create a copy of a const pool value. Update CPI to the new index and return /// the label UID. static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { @@ -1211,17 +983,12 @@ reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { - if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) { - DestReg = TRI->getSubReg(DestReg, SubIdx); - SubIdx = 0; - } - + const TargetRegisterInfo &TRI) const { unsigned Opcode = Orig->getOpcode(); switch (Opcode) { default: { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MI->getOperand(0).setReg(DestReg); + MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); break; } @@ -1237,9 +1004,6 @@ reMaterialize(MachineBasicBlock &MBB, break; } } - - MachineInstr *NewMI = prior(I); - NewMI->getOperand(0).setSubReg(SubIdx); } MachineInstr * @@ -1291,6 +1055,165 @@ bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0, return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs); } +/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to +/// determine if two loads are loading from the same base address. It should +/// only return true if the base pointers are the same and the only differences +/// between the two addresses is the offset. It also returns the offsets by +/// reference. +bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const { + // Don't worry about Thumb: just ARM and Thumb2. 
+ if (Subtarget.isThumb1Only()) return false; + + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + + switch (Load1->getMachineOpcode()) { + default: + return false; + case ARM::LDR: + case ARM::LDRB: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + switch (Load2->getMachineOpcode()) { + default: + return false; + case ARM::LDR: + case ARM::LDRB: + case ARM::LDRD: + case ARM::LDRH: + case ARM::LDRSB: + case ARM::LDRSH: + case ARM::VLDRD: + case ARM::VLDRS: + case ARM::t2LDRi8: + case ARM::t2LDRDi8: + case ARM::t2LDRSHi8: + case ARM::t2LDRi12: + case ARM::t2LDRSHi12: + break; + } + + // Check if base addresses and chain operands match. + if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(4) != Load2->getOperand(4)) + return false; + + // Index should be Reg0. + if (Load1->getOperand(3) != Load2->getOperand(3)) + return false; + + // Determine the offsets. + if (isa(Load1->getOperand(1)) && + isa(Load2->getOperand(1))) { + Offset1 = cast(Load1->getOperand(1))->getSExtValue(); + Offset2 = cast(Load2->getOperand(1))->getSExtValue(); + return true; + } + + return false; +} + +/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to +/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should +/// be scheduled togther. On some targets if two loads are loading from +/// addresses in the same cache line, it's better if they are scheduled +/// together. This function takes two integers that represent the load offsets +/// from the common base address. It returns true if it decides it's desirable +/// to schedule the two loads together. "NumLoads" is the number of loads that +/// have already been scheduled after Load1. +bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + // Don't worry about Thumb: just ARM and Thumb2. + if (Subtarget.isThumb1Only()) return false; + + assert(Offset2 > Offset1); + + if ((Offset2 - Offset1) / 8 > 64) + return false; + + if (Load1->getMachineOpcode() != Load2->getMachineOpcode()) + return false; // FIXME: overly conservative? + + // Four loads in a row should be sufficient. + if (NumLoads >= 3) + return false; + + return true; +} + +bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Debug info is never a scheduling boundary. It's necessary to be explicit + // due to the special treatment of IT instructions below, otherwise a + // dbg_value followed by an IT will result in the IT instruction being + // considered a scheduling hazard, which is wrong. It should be the actual + // instruction preceding the dbg_value instruction(s), just like it is + // when debug info is not present. + if (MI->isDebugValue()) + return false; + + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isLabel()) + return true; + + // Treat the start of the IT block as a scheduling boundary, but schedule + // t2IT along with all instructions following it. + // FIXME: This is a big hammer. But the alternative is to add all potential + // true and anti dependencies to IT block instructions as implicit operands + // to the t2IT instruction. 
The added compile time and complexity does not + // seem worth it. + MachineBasicBlock::const_iterator I = MI; + // Make sure to skip any dbg_value instructions + while (++I != MBB->end() && I->isDebugValue()) + ; + if (I != MBB->end() && I->getOpcode() == ARM::t2IT) + return true; + + // Don't attempt to schedule around any instruction that defines + // a stack-oriented pointer, as it's unlikely to be profitable. This + // saves compile time, because it doesn't require every single + // stack slot reference to depend on the instruction that does the + // modification. + if (MI->definesRegister(ARM::SP)) + return true; + + return false; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const { + if (!NumInstrs) + return false; + if (Subtarget.getCPUString() == "generic") + // Generic (and overly aggressive) if-conversion limits for testing. + return NumInstrs <= 10; + else if (Subtarget.hasV7Ops()) + return NumInstrs <= 3; + return NumInstrs <= 2; +} + +bool ARMBaseInstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + MachineBasicBlock &FMBB, unsigned NumF) const { + return NumT && NumF && NumT <= 2 && NumF <= 2; +} + /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index b566271c8db9..89a2db74a75e 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -116,11 +116,25 @@ namespace ARMII { // Thumb format ThumbFrm = 24 << FormShift, - // NEON format - NEONFrm = 25 << FormShift, - NEONGetLnFrm = 26 << FormShift, - NEONSetLnFrm = 27 << FormShift, - NEONDupFrm = 28 << FormShift, + // Miscelleaneous format + MiscFrm = 25 << FormShift, + + // NEON formats + NGetLnFrm = 26 << FormShift, + NSetLnFrm = 27 << FormShift, + NDupFrm = 28 << FormShift, + NLdStFrm = 29 << FormShift, + N1RegModImmFrm= 30 << FormShift, + N2RegFrm = 31 << FormShift, + NVCVTFrm = 32 << FormShift, + NVDupLnFrm = 33 << FormShift, + N2RegVShLFrm = 34 << FormShift, + N2RegVShRFrm = 35 << FormShift, + N3RegFrm = 36 << FormShift, + N3RegVShFrm = 37 << FormShift, + NVExtFrm = 38 << FormShift, + NVMulSLFrm = 39 << FormShift, + NVTBLFrm = 40 << FormShift, //===------------------------------------------------------------------===// // Misc flags. 
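As a quick illustration of the if-conversion limits added to ARMBaseInstrInfo.cpp above: the sketch below models only the threshold logic in plain C++. The real hooks take MachineBasicBlocks and query the subtarget; profitableSingleBlock, profitableDiamond and the boolean flags are invented names for this sketch, not part of the patch.

// Sketch only: models the decision thresholds of the new
// ARMBaseInstrInfo::isProfitableToIfCvt overloads.
#include <cstdio>

static bool profitableSingleBlock(unsigned NumInstrs, bool GenericCPU, bool HasV7Ops) {
  if (!NumInstrs)
    return false;
  if (GenericCPU)            // "generic" CPU string: loose limit used for testing
    return NumInstrs <= 10;
  if (HasV7Ops)
    return NumInstrs <= 3;
  return NumInstrs <= 2;
}

static bool profitableDiamond(unsigned NumT, unsigned NumF) {
  // Both sides of the diamond must be non-empty and short.
  return NumT && NumF && NumT <= 2 && NumF <= 2;
}

int main() {
  std::printf("%d\n", profitableSingleBlock(3, false, true)); // 1 on a v7 core
  std::printf("%d\n", profitableDiamond(2, 3));               // 0: false side too long
  return 0;
}

Note the single-block limit is 3 instructions on v7 cores and 2 otherwise, with the deliberately loose limit of 10 reserved for the "generic" CPU string used in testing.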
@@ -213,7 +227,8 @@ public: virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond) const; + const SmallVectorImpl &Cond, + DebugLoc DL) const; virtual bool ReverseBranchCondition(SmallVectorImpl &Cond) const; @@ -258,12 +273,10 @@ public: virtual unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; - virtual bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; virtual void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -283,29 +296,51 @@ public: const MDNode *MDPtr, DebugLoc DL) const; - virtual bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl &Ops) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl &Ops, - int FrameIndex) const; - - virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl &Ops, - MachineInstr* LoadMI) const; - virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo &TRI) const; MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; virtual bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1) const; + + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to + /// determine if two loads are loading from the same base address. It should + /// only return true if the base pointers are the same and the only + /// differences between the two addresses is the offset. It also returns the + /// offsets by reference. + virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2)const; + + /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to + /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should + /// be scheduled togther. On some targets if two loads are loading from + /// addresses in the same cache line, it's better if they are scheduled + /// together. This function takes two integers that represent the load offsets + /// from the common base address. It returns true if it decides it's desirable + /// to schedule the two loads together. "NumLoads" is the number of loads that + /// have already been scheduled after Load1. 
+ virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + virtual bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const; + + virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT, + MachineBasicBlock &FMBB,unsigned NumF) const; + + virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const { + return NumInstrs && NumInstrs == 1; + } }; static inline diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 82458d2347cc..182bd9937145 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -170,56 +170,6 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs; } -const TargetRegisterClass* const * -ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { - static const TargetRegisterClass * const CalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = { - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={ - &ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass, - &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass, - &ARM::GPRRegClass, &ARM::GPRRegClass, - - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, - 0 - }; - - if (STI.isThumb1Only()) { - return STI.isTargetDarwin() - ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses; - } - return STI.isTargetDarwin() - ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses; -} - BitVector ARMBaseRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { // FIXME: avoid re-calculating this everytime. 
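The shouldScheduleLoadsNear hook declared just above (and implemented earlier in this patch) reduces to three checks on the two candidate loads. A minimal standalone sketch of that logic, with shouldCluster as an invented stand-in name for the real hook:

// Sketch only: mirrors the checks in ARMBaseInstrInfo::shouldScheduleLoadsNear.
#include <cassert>
#include <cstdint>
#include <cstdio>

static bool shouldCluster(unsigned Opc1, unsigned Opc2,
                          int64_t Offset1, int64_t Offset2, unsigned NumLoads) {
  assert(Offset2 > Offset1 && "caller passes the offsets in order");
  if ((Offset2 - Offset1) / 8 > 64)   // too far apart to be worth pairing
    return false;
  if (Opc1 != Opc2)                   // patch FIXME: possibly overly conservative
    return false;
  return NumLoads < 3;                // four loads in a row are considered enough
}

int main() {
  std::printf("%d\n", shouldCluster(1, 1, 0, 8, 0));    // 1: nearby, same opcode
  std::printf("%d\n", shouldCluster(1, 1, 0, 1024, 2)); // 0: offsets too far apart
  return 0;
}

The real hook only runs after areLoadsFromSameBasePtr has already matched the base-address, chain and index operands of the two loads.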
@@ -352,7 +302,7 @@ ARMBaseRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, } bool -ARMBaseRegisterInfo::canCombinedSubRegIndex(const TargetRegisterClass *RC, +ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC, SmallVectorImpl &SubIndices, unsigned &NewSubIdx) const { @@ -724,6 +674,15 @@ ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const { I != E; ++I) { for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { if (!I->getOperand(i).isFI()) continue; + + // When using ADDri to get the address of a stack object, 255 is the + // largest offset guaranteed to fit in the immediate offset. + if (I->getOpcode() == ARM::ADDri) { + Limit = std::min(Limit, (1U << 8) - 1); + break; + } + + // Otherwise check the addressing mode. switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: @@ -765,6 +724,7 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, SmallVector UnspilledCS1GPRs; SmallVector UnspilledCS2GPRs; ARMFunctionInfo *AFI = MF.getInfo(); + MachineFrameInfo *MFI = MF.getFrameInfo(); // Spill R4 if Thumb2 function requires stack realignment - it will be used as // scratch register. @@ -780,7 +740,6 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. const unsigned *CSRegs = getCalleeSavedRegs(); - const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses(); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; @@ -798,50 +757,50 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } } - if (CSRegClasses[i] == ARM::GPRRegisterClass || - CSRegClasses[i] == ARM::tGPRRegisterClass) { - if (Spilled) { - NumGPRSpills++; + if (!ARM::GPRRegisterClass->contains(Reg)) + continue; - if (!STI.isTargetDarwin()) { - if (Reg == ARM::LR) - LRSpilled = true; - CS1Spilled = true; - continue; - } + if (Spilled) { + NumGPRSpills++; - // Keep track if LR and any of R4, R5, R6, and R7 is spilled. - switch (Reg) { - case ARM::LR: + if (!STI.isTargetDarwin()) { + if (Reg == ARM::LR) LRSpilled = true; - // Fallthrough - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - CS1Spilled = true; - break; - default: - break; - } - } else { - if (!STI.isTargetDarwin()) { - UnspilledCS1GPRs.push_back(Reg); - continue; - } + CS1Spilled = true; + continue; + } - switch (Reg) { - case ARM::R4: - case ARM::R5: - case ARM::R6: - case ARM::R7: - case ARM::LR: - UnspilledCS1GPRs.push_back(Reg); - break; - default: - UnspilledCS2GPRs.push_back(Reg); - break; - } + // Keep track if LR and any of R4, R5, R6, and R7 is spilled. + switch (Reg) { + case ARM::LR: + LRSpilled = true; + // Fallthrough + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + CS1Spilled = true; + break; + default: + break; + } + } else { + if (!STI.isTargetDarwin()) { + UnspilledCS1GPRs.push_back(Reg); + continue; + } + + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + UnspilledCS1GPRs.push_back(Reg); + break; + default: + UnspilledCS2GPRs.push_back(Reg); + break; } } } @@ -862,9 +821,16 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // offset, make sure a register (or a spill slot) is available for the // register scavenger. 
Note that if we're indexing off the frame pointer, the // effective stack size is 4 bytes larger since the FP points to the stack - // slot of the previous FP. - bool BigStack = RS && - estimateStackSize(MF) + (hasFP(MF) ? 4 : 0) >= estimateRSStackSizeLimit(MF); + // slot of the previous FP. Also, if we have variable sized objects in the + // function, stack slot references will often be negative, and some of + // our instructions are positive-offset only, so conservatively consider + // that case to want a spill slot (or register) as well. + // FIXME: We could add logic to be more precise about negative offsets + // and which instructions will need a scratch register for them. Is it + // worth the effort and added fragility? + bool BigStack = + (RS && (estimateStackSize(MF) + (hasFP(MF) ? 4:0) >= + estimateRSStackSizeLimit(MF))) || MFI->hasVarSizedObjects(); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || cannotEliminateFrame(MF)) { @@ -957,7 +923,6 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. const TargetRegisterClass *RC = ARM::GPRRegisterClass; - MachineFrameInfo *MFI = MF.getFrameInfo(); RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); @@ -1622,6 +1587,7 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = prior(MBB.end()); assert(MBBI->getDesc().isReturn() && "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); @@ -1696,6 +1662,39 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size()); } + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || + RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = prior(MBB.end()); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::TAILJMPdt : ARM::TAILJMPd)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else if (RetOpcode == ARM::TCRETURNdiND) { + BuildMI(MBB, MBBI, dl, + TII.get(STI.isThumb() ? ARM::TAILJMPdNDt : ARM::TAILJMPdND)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPr)). + addReg(JumpTarget.getReg(), RegState::Kill); + } else if (RetOpcode == ARM::TCRETURNriND) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::TAILJMPrND)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = prior(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. 
+ MBB.erase(MBBI); + } + if (VARegSaveSize) emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); } diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 2c9c82d0318b..f7ee0d5cc66d 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -69,9 +69,6 @@ public: /// Code Generation virtual methods... const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const TargetRegisterClass* const* - getCalleeSavedRegClasses(const MachineFunction *MF = 0) const; - BitVector getReservedRegs(const MachineFunction &MF) const; /// getMatchingSuperRegClass - Return a subclass of the specified register @@ -81,14 +78,15 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; - /// canCombinedSubRegIndex - Given a register class and a list of sub-register - /// indices, return true if it's possible to combine the sub-register indices - /// into one that corresponds to a larger sub-register. Return the new sub- - /// register index by reference. Note the new index by be zero if the given - /// sub-registers combined to form the whole register. - virtual bool canCombinedSubRegIndex(const TargetRegisterClass *RC, - SmallVectorImpl &SubIndices, - unsigned &NewSubIdx) const; + /// canCombineSubRegIndices - Given a register class and a list of + /// subregister indices, return true if it's possible to combine the + /// subregister indices into one that corresponds to a larger + /// subregister. Return the new subregister index by reference. Note the + /// new index may be zero if the given subregisters can be combined to + /// form the whole register. + virtual bool canCombineSubRegIndices(const TargetRegisterClass *RC, + SmallVectorImpl &SubIndices, + unsigned &NewSubIdx) const; const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; @@ -150,8 +148,8 @@ public: virtual bool canSimplifyCallFramePseudos(MachineFunction &MF) const; virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, FrameIndexValue *Value = NULL, diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index f2730fc8a5cb..7895cb071922 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -55,6 +55,7 @@ namespace { const std::vector *MCPEs; const std::vector *MJTEs; bool IsPIC; + bool IsThumb; void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); @@ -67,8 +68,8 @@ namespace { : MachineFunctionPass(&ID), JTI(0), II((const ARMInstrInfo *)tm.getInstrInfo()), TD(tm.getTargetData()), TM(tm), - MCE(mce), MCPEs(0), MJTEs(0), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + MCE(mce), MCPEs(0), MJTEs(0), + IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {} /// getBinaryCodeForInstr - This function, generated by the /// CodeEmitterGenerator using TableGen, produces the binary encoding for @@ -139,6 +140,12 @@ namespace { void emitMiscInstruction(const MachineInstr &MI); + void emitNEONLaneInstruction(const MachineInstr &MI); + void emitNEONDupInstruction(const MachineInstr &MI); + void emitNEON1RegModImmInstruction(const MachineInstr &MI); + void emitNEON2RegInstruction(const MachineInstr &MI); + void emitNEON3RegInstruction(const MachineInstr &MI); + /// getMachineOpValue - 
Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO); @@ -147,7 +154,8 @@ namespace { } /// getMovi32Value - Return binary encoding of operand for movw/movt. If the - /// machine operand requires relocation, record the relocation and return zero. + /// machine operand requires relocation, record the relocation and return + /// zero. unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO, unsigned Reloc); unsigned getMovi32Value(const MachineInstr &MI, unsigned OpIdx, @@ -193,6 +201,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { MJTEs = 0; if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; + IsThumb = MF.getInfo()->isThumbFunction(); JTI->Initialize(MF, IsPIC); MMI = &getAnalysis(); MCE.setModuleInfo(MMI); @@ -347,7 +356,7 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { MCE.processDebugLoc(MI.getDebugLoc(), true); - NumEmitted++; // Keep track of the # of mi's emitted + ++NumEmitted; // Keep track of the # of mi's emitted switch (MI.getDesc().TSFlags & ARMII::FormMask) { default: { llvm_unreachable("Unhandled instruction encoding format!"); @@ -407,6 +416,23 @@ void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) { case ARMII::VFPMiscFrm: emitMiscInstruction(MI); break; + // NEON instructions. + case ARMII::NGetLnFrm: + case ARMII::NSetLnFrm: + emitNEONLaneInstruction(MI); + break; + case ARMII::NDupFrm: + emitNEONDupInstruction(MI); + break; + case ARMII::N1RegModImmFrm: + emitNEON1RegModImmInstruction(MI); + break; + case ARMII::N2RegFrm: + emitNEON2RegInstruction(MI); + break; + case ARMII::N3RegFrm: + emitNEON3RegInstruction(MI); + break; } MCE.processDebugLoc(MI.getDebugLoc(), false); } @@ -1539,4 +1565,144 @@ void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) { emitWordLE(Binary); } +static unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegD = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegD = ARMRegisterInfo::getRegisterNumbering(RegD); + Binary |= (RegD & 0xf) << ARMII::RegRdShift; + Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift; + return Binary; +} + +static unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegN = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegN = ARMRegisterInfo::getRegisterNumbering(RegN); + Binary |= (RegN & 0xf) << ARMII::RegRnShift; + Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift; + return Binary; +} + +static unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) { + unsigned RegM = MI.getOperand(OpIdx).getReg(); + unsigned Binary = 0; + RegM = ARMRegisterInfo::getRegisterNumbering(RegM); + Binary |= (RegM & 0xf); + Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift; + return Binary; +} + +/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON +/// data-processing instruction to the corresponding Thumb encoding. 
+static unsigned convertNEONDataProcToThumb(unsigned Binary) { + assert((Binary & 0xfe000000) == 0xf2000000 && + "not an ARM NEON data-processing instruction"); + unsigned UBit = (Binary >> 24) & 1; + return 0xef000000 | (UBit << 28) | (Binary & 0xffffff); +} + +void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + unsigned RegTOpIdx, RegNOpIdx, LnOpIdx; + const TargetInstrDesc &TID = MI.getDesc(); + if ((TID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) { + RegTOpIdx = 0; + RegNOpIdx = 1; + LnOpIdx = 2; + } else { // ARMII::NSetLnFrm + RegTOpIdx = 2; + RegNOpIdx = 0; + LnOpIdx = 3; + } + + // Set the conditional execution predicate + Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(RegTOpIdx).getReg(); + RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, RegNOpIdx); + + unsigned LaneShift; + if ((Binary & (1 << 22)) != 0) + LaneShift = 0; // 8-bit elements + else if ((Binary & (1 << 5)) != 0) + LaneShift = 1; // 16-bit elements + else + LaneShift = 2; // 32-bit elements + + unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift; + unsigned Opc1 = Lane >> 2; + unsigned Opc2 = Lane & 3; + assert((Opc1 & 3) == 0 && "out-of-range lane number operand"); + Binary |= (Opc1 << 21); + Binary |= (Opc2 << 5); + + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + + // Set the conditional execution predicate + Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift; + + unsigned RegT = MI.getOperand(1).getReg(); + RegT = ARMRegisterInfo::getRegisterNumbering(RegT); + Binary |= (RegT << ARMII::RegRdShift); + Binary |= encodeNEONRn(MI, 0); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) { + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd. + Binary |= encodeNEONRd(MI, 0); + // Immediate fields: Op, Cmode, I, Imm3, Imm4 + unsigned Imm = MI.getOperand(1).getImm(); + unsigned Op = (Imm >> 12) & 1; + unsigned Cmode = (Imm >> 8) & 0xf; + unsigned I = (Imm >> 7) & 1; + unsigned Imm3 = (Imm >> 4) & 0x7; + unsigned Imm4 = Imm & 0xf; + Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4; + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source register in Dm. + unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VDUPfdf or VDUPfqf. + emitWordLE(Binary); +} + +void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) { + const TargetInstrDesc &TID = MI.getDesc(); + unsigned Binary = getBinaryCodeForInstr(MI); + // Destination register is encoded in Dd; source registers in Dn and Dm. 
+ unsigned OpIdx = 0; + Binary |= encodeNEONRd(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRn(MI, OpIdx++); + if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1) + ++OpIdx; + Binary |= encodeNEONRm(MI, OpIdx); + if (IsThumb) + Binary = convertNEONDataProcToThumb(Binary); + // FIXME: This does not handle VMOVDneon or VMOVQ. + emitWordLE(Binary); +} + #include "ARMGenCodeEmitter.inc" diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 13d8b74014c5..65a3da6f1617 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -337,7 +337,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { if (CPChange && ++NoCPIters > 30) llvm_unreachable("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); - + // Clear NewWaterList now. If we split a block for branches, it should // appear as "new water" for the next iteration of constant pool placement. NewWaterList.clear(); @@ -361,8 +361,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) { // After a while, this might be made debug-only, but it is not expensive. verify(MF); - // If LR has been forced spilled and no far jumps (i.e. BL) has been issued. - // Undo the spill / restore of LR if possible. + // If LR has been forced spilled and no far jump (i.e. BL) has been issued, + // undo the spill / restore of LR if possible. if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) MadeChange |= UndoLRSpillRestore(); @@ -407,7 +407,7 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, std::vector CPEs; CPEs.push_back(CPEntry(CPEMI, i)); CPEntries.push_back(CPEs); - NumCPEs++; + ++NumCPEs; DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i << "\n"); } @@ -418,7 +418,8 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF, static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. MachineFunction::iterator MBBI = MBB; - if (llvm::next(MBBI) == MBB->getParent()->end()) // Can't fall off end of function. + // Can't fall off end of function. + if (llvm::next(MBBI) == MBB->getParent()->end()) return false; MachineBasicBlock *NextBB = llvm::next(MBBI); @@ -491,6 +492,8 @@ void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF, unsigned MBBSize = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (I->isDebugValue()) + continue; // Add instruction size to MBBSize. MBBSize += TII->GetInstSizeInBytes(I); @@ -722,7 +725,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // correspond to anything in the source. unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B; BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB); - NumSplit++; + ++NumSplit; // Update the CFG. All succs of OrigBB are now succs of NewBB. 
while (!OrigBB->succ_empty()) { @@ -945,7 +948,7 @@ bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { if (--CPE->RefCount == 0) { RemoveDeadCPEMI(CPEMI); CPE->CPEMI = NULL; - NumCPEs--; + --NumCPEs; return true; } return false; @@ -1246,7 +1249,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF, U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) .addImm(ID).addConstantPoolIndex(CPI).addImm(Size); CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); - NumCPEs++; + ++NumCPEs; BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; // Compensate for .align 2 in thumb mode. @@ -1369,7 +1372,7 @@ ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) { BBSizes[MBB->getNumber()] += 2; AdjustBBOffsetsAfter(MBB, 2); HasFarJump = true; - NumUBrFixed++; + ++NumUBrFixed; DEBUG(errs() << " Changed B to long jump " << *MI); @@ -1402,7 +1405,7 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) { MachineInstr *BMI = &MBB->back(); bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - NumCBrFixed++; + ++NumCBrFixed; if (BMI != MI) { if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) && BMI->getOpcode() == Br.UncondBr) { @@ -1621,7 +1624,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { // constantpool tables? MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); if (MJTI == 0) return false; - + const std::vector &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { MachineInstr *MI = T2JumpTables[i]; @@ -1658,15 +1661,25 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { continue; unsigned IdxReg = MI->getOperand(1).getReg(); bool IdxRegKill = MI->getOperand(1).isKill(); + + // Scan backwards to find the instruction that defines the base + // register. Due to post-RA scheduling, we can't count on it + // immediately preceding the branch instruction. MachineBasicBlock::iterator PrevI = MI; - if (PrevI == MBB->begin()) + MachineBasicBlock::iterator B = MBB->begin(); + while (PrevI != B && !PrevI->definesRegister(BaseReg)) + --PrevI; + + // If for some reason we didn't find it, we can't do anything, so + // just skip this one. + if (!PrevI->definesRegister(BaseReg)) continue; - MachineInstr *AddrMI = --PrevI; + MachineInstr *AddrMI = PrevI; bool OptOk = true; - // Examine the instruction that calculate the jumptable entry address. - // If it's not the one just before the t2BR_JT, we won't delete it, then - // it's not worth doing the optimization. + // Examine the instruction that calculates the jumptable entry address. + // Make sure it only defines the base register and kills any uses + // other than the index register. for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) { const MachineOperand &MO = AddrMI->getOperand(k); if (!MO.isReg() || !MO.getReg()) @@ -1683,9 +1696,14 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) { if (!OptOk) continue; - // The previous instruction should be a tLEApcrel or t2LEApcrelJT, we want + // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction + // that gave us the initial base register definition. + for (--PrevI; PrevI != B && !PrevI->definesRegister(BaseReg); --PrevI) + ; + + // The instruction should be a tLEApcrel or t2LEApcrelJT; we want // to delete it as well. 
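// One subtlety in the backward scan above: the loop can also stop because it
// reached the start of the block, so the predicate has to be re-tested after
// the loop before the result is trusted.  A standalone model of the same
// pattern (the Instr struct and the container are invented for illustration):
#include <vector>

struct Instr { unsigned DefReg; };

static const Instr *findDefBackwards(const std::vector<Instr> &Block,
                                     unsigned FromIdx,   // a valid index
                                     unsigned Reg) {
  unsigned i = FromIdx;
  while (i != 0 && Block[i].DefReg != Reg)
    --i;
  // i == 0 can mean either "found at the first instruction" or "not found";
  // bail out in the latter case, exactly as the pass does.
  if (Block[i].DefReg != Reg)
    return 0;
  return &Block[i];
}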
- MachineInstr *LeaMI = --PrevI; + MachineInstr *LeaMI = PrevI; if ((LeaMI->getOpcode() != ARM::tLEApcrelJT && LeaMI->getOpcode() != ARM::t2LEApcrelJT) || LeaMI->getOperand(0).getReg() != BaseReg) @@ -1729,7 +1747,7 @@ bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) { MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); if (MJTI == 0) return false; - + const std::vector &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { MachineInstr *MI = T2JumpTables[i]; @@ -1769,7 +1787,7 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineFunction &MF = *BB->getParent(); - // If it's the destination block is terminated by an unconditional branch, + // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target. This is a very simple // heuristic. FIXME: We can definitely improve it. diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h index 6f4eddf7361d..3119b54563de 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.h +++ b/lib/Target/ARM/ARMConstantPoolValue.h @@ -15,6 +15,7 @@ #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H #include "llvm/CodeGen/MachineConstantPool.h" +#include namespace llvm { diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index c87f5d7c16a5..9c62597b4323 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -144,13 +144,15 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineInstrBuilder Even = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(EvenDst, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(EvenSrc, getKillRegState(SrcIsKill))); + .addReg(EvenDst, + getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(EvenSrc, getKillRegState(SrcIsKill))); MachineInstrBuilder Odd = AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::VMOVQ)) - .addReg(OddDst, getDefRegState(true) | getDeadRegState(DstIsDead)) - .addReg(OddSrc, getKillRegState(SrcIsKill))); + .addReg(OddDst, + getDefRegState(true) | getDeadRegState(DstIsDead)) + .addReg(OddSrc, getKillRegState(SrcIsKill))); TransferImpOps(MI, Even, Odd); MI.eraseFromParent(); Modified = true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9baef6bf1243..c84d3ff81324 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMTargetMachine.h" @@ -35,11 +36,6 @@ using namespace llvm; -static cl::opt -UseRegSeq("neon-reg-sequence", cl::Hidden, - cl::desc("Use reg_sequence to model ld / st of multiple neon regs"), - cl::init(true)); - //===--------------------------------------------------------------------===// /// ARMDAGToDAGISel - ARM specific code to select ARM machine /// instructions for SelectionDAG operations. @@ -147,6 +143,11 @@ private: unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1); + /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, + /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be + /// generated to force the table registers to be consecutive. 
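// For reference, the byte-level behaviour these intrinsics select: each index
// byte picks a byte out of the concatenated table registers; out-of-range
// indices produce 0 for VTBL and leave the destination byte unchanged for
// VTBX.  A standalone model of the two-register (16-byte table) case; this
// describes the architectural semantics, not the selection code itself, and
// the function names are invented.
#include <cstdint>
#include <cstddef>

static void vtbl2_model(uint8_t Dst[8], const uint8_t Table[16],
                        const uint8_t Idx[8]) {
  for (size_t i = 0; i < 8; ++i)
    Dst[i] = (Idx[i] < 16) ? Table[Idx[i]] : 0;
}

static void vtbx2_model(uint8_t Dst[8], const uint8_t Table[16],
                        const uint8_t Idx[8]) {
  for (size_t i = 0; i < 8; ++i)
    if (Idx[i] < 16)
      Dst[i] = Table[Idx[i]];   // out-of-range lanes keep their old value
}
// Because the table is the concatenation of 2-4 D registers, those registers
// must be allocated consecutively, which is what the REG_SEQUENCE mentioned
// above guarantees.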
+ SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc); + /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM. SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned); @@ -173,24 +174,17 @@ private: char ConstraintCode, std::vector &OutOps); - /// PairDRegs - Form a quad register from a pair of D registers. - /// + // Form pairs of consecutive S, D, or Q registers. + SDNode *PairSRegs(EVT VT, SDValue V0, SDValue V1); SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1); - - /// PairDRegs - Form a quad register pair from a pair of Q registers. - /// SDNode *PairQRegs(EVT VT, SDValue V0, SDValue V1); - /// QuadDRegs - Form a quad register pair from a quad of D registers. - /// + // Form sequences of 4 consecutive S, D, or Q registers. + SDNode *QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - - /// QuadQRegs - Form 4 consecutive Q registers. - /// SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - /// OctoDRegs - Form 8 consecutive D registers. - /// + // Form sequences of 8 consecutive D registers. SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3, SDValue V4, SDValue V5, SDValue V6, SDValue V7); }; @@ -544,10 +538,9 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N, bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base, SDValue &Offset){ // FIXME dl should come from the parent load or store, not the address - DebugLoc dl = Op->getDebugLoc(); if (N.getOpcode() != ISD::ADD) { ConstantSDNode *NC = dyn_cast(N); - if (!NC || NC->getZExtValue() != 0) + if (!NC || !NC->isNullValue()) return false; Base = Offset = N; @@ -788,8 +781,9 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, if (N.getOpcode() == ISD::ADD) { if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); + // 8 bits. if (((RHSC & 0x3) == 0) && - ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits. + ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); return true; @@ -798,7 +792,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, } else if (N.getOpcode() == ISD::SUB) { if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); - if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { // 8 bits. + // 8 bits. + if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { Base = N.getOperand(0); OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32); return true; @@ -960,22 +955,24 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { return NULL; } +/// PairSRegs - Form a D register from a pair of S registers. +/// +SDNode *ARMDAGToDAGISel::PairSRegs(EVT VT, SDValue V0, SDValue V1) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); +} + /// PairDRegs - Form a quad register from a pair of D registers. 
/// SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) { DebugLoc dl = V0.getNode()->getDebugLoc(); SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - if (llvm::ModelWithRegSequence()) { - const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); - } - SDValue Undef = - SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0); - SDNode *Pair = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VT, Undef, V0, SubReg0); - return CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VT, SDValue(Pair, 0), V1, SubReg1); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); } /// PairQRegs - Form 4 consecutive D registers from a pair of Q registers. @@ -988,6 +985,19 @@ SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4); } +/// QuadSRegs - Form 4 consecutive S registers. +/// +SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1, + SDValue V2, SDValue V3) { + DebugLoc dl = V0.getNode()->getDebugLoc(); + SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32); + SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32); + SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32); + SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32); + const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); +} + /// QuadDRegs - Form 4 consecutive D registers. /// SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1, @@ -1088,7 +1098,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, std::vector ResTys(NumVecs, VT); ResTys.push_back(MVT::Other); SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 5); - if (!llvm::ModelWithRegSequence() || NumVecs < 2) + if (NumVecs < 2) return VLd; SDValue RegSeq; @@ -1129,24 +1139,17 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, Chain = SDValue(VLd, 2 * NumVecs); // Combine the even and odd subregs to produce the result. 
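// These REG_SEQUENCE builders lean on the way the VFP/NEON register file
// aliases: S2k and S2k+1 occupy Dk (for k < 16), and D2k and D2k+1 occupy Qk.
// A standalone sketch of that arithmetic; the function names are invented,
// only the aliasing rule is taken from the ARM register file layout.
static inline unsigned containingD(unsigned SReg) { return SReg / 2; } // S -> D
static inline unsigned containingQ(unsigned DReg) { return DReg / 2; } // D -> Q
static inline unsigned ssubIndex(unsigned SReg)   { return SReg % 2; } // ssub_* slot
static inline unsigned dsubIndex(unsigned DReg)   { return DReg % 2; } // dsub_* slot
// e.g. PairSRegs over S4,S5 lands in D2 and QuadSRegs over S4..S7 in Q1; the
// inputs must end up in consecutive registers starting at the right boundary,
// and expressing the group as a single REG_SEQUENCE lets the register
// allocator enforce exactly that.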
- if (llvm::ModelWithRegSequence()) { - if (NumVecs == 1) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); - ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); - } else { - SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, - SDValue(VLd, 0), SDValue(VLd, 1), - SDValue(VLd, 2), SDValue(VLd, 3)), 0); - SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); - SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); - ReplaceUses(SDValue(N, 0), Q0); - ReplaceUses(SDValue(N, 1), Q1); - } + if (NumVecs == 1) { + SDNode *Q = PairDRegs(VT, SDValue(VLd, 0), SDValue(VLd, 1)); + ReplaceUses(SDValue(N, 0), SDValue(Q, 0)); } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1)); - ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); - } + SDValue QQ = SDValue(QuadDRegs(MVT::v4i64, + SDValue(VLd, 0), SDValue(VLd, 1), + SDValue(VLd, 2), SDValue(VLd, 3)), 0); + SDValue Q0 = CurDAG->getTargetExtractSubreg(ARM::qsub_0, dl, VT, QQ); + SDValue Q1 = CurDAG->getTargetExtractSubreg(ARM::qsub_1, dl, VT, QQ); + ReplaceUses(SDValue(N, 0), Q0); + ReplaceUses(SDValue(N, 1), Q1); } } else { // Otherwise, quad registers are loaded with two separate instructions, @@ -1169,37 +1172,27 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 6); Chain = SDValue(VLdB, NumVecs+1); - if (llvm::ModelWithRegSequence()) { - SDValue V0 = SDValue(VLdA, 0); - SDValue V1 = SDValue(VLdB, 0); - SDValue V2 = SDValue(VLdA, 1); - SDValue V3 = SDValue(VLdB, 1); - SDValue V4 = SDValue(VLdA, 2); - SDValue V5 = SDValue(VLdB, 2); - SDValue V6 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), - 0) - : SDValue(VLdA, 3); - SDValue V7 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), - 0) - : SDValue(VLdB, 3); - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, - V4, V5, V6, V7), 0); - - // Extract out the 3 / 4 Q registers. - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, - dl, VT, RegSeq); - ReplaceUses(SDValue(N, Vec), Q); - } - } else { - // Combine the even and odd subregs to produce the result. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec)); - ReplaceUses(SDValue(N, Vec), SDValue(Q, 0)); - } + SDValue V0 = SDValue(VLdA, 0); + SDValue V1 = SDValue(VLdB, 0); + SDValue V2 = SDValue(VLdA, 1); + SDValue V3 = SDValue(VLdB, 1); + SDValue V4 = SDValue(VLdA, 2); + SDValue V5 = SDValue(VLdB, 2); + SDValue V6 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) + : SDValue(VLdA, 3); + SDValue V7 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,RegVT), 0) + : SDValue(VLdB, 3); + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V0, V1, V2, V3, + V4, V5, V6, V7), 0); + + // Extract out the 3 / 4 Q registers. 
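// The interleave just built pairs each VLdA result (the even, low D half of a
// Q register) with the matching VLdB result (the odd, high half), padding the
// unused fourth Q of a vld3 with IMPLICIT_DEF.  A standalone sketch of that
// ordering; UNDEF stands in for IMPLICIT_DEF and the ints are just labels.
#include <vector>

static std::vector<int> interleaveHalves(const std::vector<int> &LoHalves, // VLdA
                                         const std::vector<int> &HiHalves, // VLdB
                                         unsigned NumVecs) {               // 3 or 4
  const int UNDEF = -1;
  std::vector<int> Seq;
  for (unsigned Vec = 0; Vec < 4; ++Vec) {
    Seq.push_back(Vec < NumVecs ? LoHalves[Vec] : UNDEF);
    Seq.push_back(Vec < NumVecs ? HiHalves[Vec] : UNDEF);
  }
  return Seq;  // 8 entries, one per D slot of the QQQQ sequence
}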
+ assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { + SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec, + dl, VT, RegSeq); + ReplaceUses(SDValue(N, Vec), Q); } } ReplaceUses(SDValue(N, NumVecs), Chain); @@ -1209,7 +1202,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { - assert(NumVecs >=1 && NumVecs <= 4 && "VST NumVecs out-of-range"); + assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); SDValue MemAddr, Align; @@ -1247,7 +1240,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Align); if (is64BitVector) { - if (llvm::ModelWithRegSequence() && NumVecs >= 2) { + if (NumVecs >= 2) { SDValue RegSeq; SDValue V0 = N->getOperand(0+3); SDValue V1 = N->getOperand(1+3); @@ -1292,7 +1285,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Quad registers are directly supported for VST1 and VST2, // storing pairs of D regs. unsigned Opc = QOpcodes0[OpcodeIndex]; - if (llvm::ModelWithRegSequence() && NumVecs == 2) { + if (NumVecs == 2) { // First extract the pair of Q registers. SDValue Q0 = N->getOperand(3); SDValue Q1 = N->getOperand(4); @@ -1330,76 +1323,48 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Otherwise, quad registers are stored with two separate instructions, // where one stores the even registers and the other stores the odd registers. - if (llvm::ModelWithRegSequence()) { - // Form the QQQQ REG_SEQUENCE. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3)); - V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); - - // Store the even D registers. - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - Ops.push_back(Reg0); // post-access address offset - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, - RegVT, RegSeq)); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStA, 1); - // Store the odd D registers. - Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, - RegVT, RegSeq); - Ops[NumVecs+5] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; - } else { - Ops.push_back(Reg0); // post-access address offset - - // Store the even subregs. 
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, - N->getOperand(Vec+3))); - Ops.push_back(Pred); - Ops.push_back(Reg0); // predicate register - Ops.push_back(Chain); - unsigned Opc = QOpcodes0[OpcodeIndex]; - SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStA, 1); - - // Store the odd subregs. - Ops[0] = SDValue(VStA, 0); // MemAddr - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, - N->getOperand(Vec+3)); - Ops[NumVecs+5] = Chain; - Opc = QOpcodes1[OpcodeIndex]; - SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); - Chain = SDValue(VStB, 1); - ReplaceUses(SDValue(N, 0), Chain); - return NULL; - } + // Form the QQQQ REG_SEQUENCE. + SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + V[i] = CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, RegVT, + N->getOperand(Vec+3)); + V[i+1] = CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, RegVT, + N->getOperand(Vec+3)); + } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + + SDValue RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); + + // Store the even D registers. + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + Ops.push_back(Reg0); // post-access address offset + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, + RegVT, RegSeq)); + Ops.push_back(Pred); + Ops.push_back(Reg0); // predicate register + Ops.push_back(Chain); + unsigned Opc = QOpcodes0[OpcodeIndex]; + SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStA, 1); + + // Store the odd D registers. + Ops[0] = SDValue(VStA, 0); // MemAddr + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, + RegVT, RegSeq); + Ops[NumVecs+5] = Chain; + Opc = QOpcodes1[OpcodeIndex]; + SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), + MVT::Other, Ops.data(), NumVecs+6); + Chain = SDValue(VStB, 1); + ReplaceUses(SDValue(N, 0), Chain); + return NULL; } SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, @@ -1421,13 +1386,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, // Quad registers are handled by load/store of subregs. Find the subreg info. unsigned NumElts = 0; - int SubregIdx = 0; bool Even = false; EVT RegVT = VT; if (!is64BitVector) { RegVT = GetNEONSubregVT(VT); NumElts = RegVT.getVectorNumElements(); - SubregIdx = (Lane < NumElts) ? ARM::dsub_0 : ARM::dsub_1; Even = Lane < NumElts; } @@ -1455,35 +1418,26 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned Opc = 0; if (is64BitVector) { Opc = DOpcodes[OpcodeIndex]; - if (llvm::ModelWithRegSequence()) { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - - // Now extract the D registers back out. 
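// Two layout facts the even/odd handling here depends on.  First, in the
// QQQQ sequence built for the quad-register stores, vector Vec contributes
// its low D half at dsub_0 + 2*Vec and its high half at dsub_1 + 2*Vec, which
// is why one store instruction takes the even-indexed subregs and the other
// the odd ones.  Second, lane L of a Q register with 2*EltsPerDReg elements
// lives in the low D half exactly when L < EltsPerDReg, at position
// L % EltsPerDReg within that half (the Even test above).  A standalone
// sketch of both computations; all names are invented and dsub_0/dsub_1 are
// represented by 0 and 1.
struct QLanePos { bool LowHalf; unsigned SubLane; };

static QLanePos splitQLane(unsigned Lane, unsigned EltsPerDReg) {
  QLanePos P;
  P.LowHalf = Lane < EltsPerDReg;
  P.SubLane = Lane % EltsPerDReg;
  return P;
}

static unsigned lowHalfSubreg(unsigned Vec)  { return 0 + 2 * Vec; } // dsub_0+2*Vec
static unsigned highHalfSubreg(unsigned Vec) { return 1 + 2 * Vec; } // dsub_1+2*Vec
// e.g. lane 5 of a v8i16 (EltsPerDReg == 4) is lane 1 of the high D half.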
- Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, - RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, - RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, - RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, - RegSeq)); + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(N->getOperand(Vec+3)); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } + + // Now extract the D registers back out. + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT,RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,RegSeq)); } else { // Check if this is loading the even or odd subreg of a Q register. if (Lane < NumElts) { @@ -1493,31 +1447,24 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, Opc = QOpcodes1[OpcodeIndex]; } - if (llvm::ModelWithRegSequence()) { - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - } - - // Extract the subregs of the input vector. - unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, - RegSeq)); + SDValue RegSeq; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); } else { - // Extract the subregs of the input vector. - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT, - N->getOperand(Vec+3))); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); } + + // Extract the subregs of the input vector. + unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, + RegSeq)); } Ops.push_back(getI32Imm(Lane)); Ops.push_back(Pred); @@ -1531,76 +1478,97 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, ResTys.push_back(MVT::Other); SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6); - if (llvm::ModelWithRegSequence()) { - // Form a REG_SEQUENCE to force register allocation. 
- SDValue RegSeq; - if (is64BitVector) { - SDValue V0 = SDValue(VLdLn, 0); - SDValue V1 = SDValue(VLdLn, 1); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = SDValue(VLdLn, 2); - // If it's a vld3, form a quad D-register but discard the last part. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : SDValue(VLdLn, 3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + if (is64BitVector) { + SDValue V0 = SDValue(VLdLn, 0); + SDValue V1 = SDValue(VLdLn, 1); + if (NumVecs == 2) { + RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); } else { - // For 128-bit vectors, take the 64-bit results of the load and insert them - // as subregs into the result. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - if (Even) { - V[i] = SDValue(VLdLn, Vec); - V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - } else { - V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - V[i+1] = SDValue(VLdLn, Vec); - } + SDValue V2 = SDValue(VLdLn, 2); + // If it's a vld3, form a quad D-register but discard the last part. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : SDValue(VLdLn, 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + } else { + // For 128-bit vectors, take the 64-bit results of the load and insert + // them as subregs into the result. + SDValue V[8]; + for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { + if (Even) { + V[i] = SDValue(VLdLn, Vec); + V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + } else { + V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); + V[i+1] = SDValue(VLdLn, Vec); } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - - if (NumVecs == 2) - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); - else - RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); } + if (NumVecs == 3) + V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, RegVT), 0); - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); - unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); - return NULL; - } - - // For a 64-bit vector load to D registers, nothing more needs to be done. - if (is64BitVector) - return VLdLn; - - // For 128-bit vectors, take the 64-bit results of the load and insert them - // as subregs into the result. 
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT, - N->getOperand(Vec+3), - SDValue(VLdLn, Vec)); - ReplaceUses(SDValue(N, Vec), QuadVec); + if (NumVecs == 2) + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); + else + RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], + V[4], V[5], V[6], V[7]), 0); } - Chain = SDValue(VLdLn, NumVecs); - ReplaceUses(SDValue(N, NumVecs), Chain); + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); + unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); + ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); return NULL; } +SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, + unsigned Opc) { + assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range"); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + unsigned FirstTblReg = IsExt ? 2 : 1; + + // Form a REG_SEQUENCE to force register allocation. + SDValue RegSeq; + SDValue V0 = N->getOperand(FirstTblReg + 0); + SDValue V1 = N->getOperand(FirstTblReg + 1); + if (NumVecs == 2) + RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0); + else { + SDValue V2 = N->getOperand(FirstTblReg + 2); + // If it's a vtbl3, form a quad D-register and leave the last part as + // an undef. + SDValue V3 = (NumVecs == 3) + ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0) + : N->getOperand(FirstTblReg + 3); + RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + } + + // Now extract the D registers back out. 
+ SmallVector Ops; + if (IsExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); + if (NumVecs > 2) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT, RegSeq)); + if (NumVecs > 3) + Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, RegSeq)); + + Ops.push_back(N->getOperand(FirstTblReg + NumVecs)); + Ops.push_back(getAL(CurDAG)); // predicate + Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register + return CurDAG->getMachineNode(Opc, dl, VT, Ops.data(), Ops.size()); +} + SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) @@ -1954,8 +1922,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { - SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5); + SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; + return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6); } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7); @@ -2015,7 +1983,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4); + return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32,Ops,4); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), @@ -2029,7 +1997,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4); + return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32,Ops,4); } else { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), @@ -2211,6 +2179,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg }; return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4); } + case ARMISD::BUILD_VECTOR: { + EVT VecVT = N->getValueType(0); + EVT EltVT = VecVT.getVectorElementType(); + unsigned NumElts = VecVT.getVectorNumElements(); + if (EltVT.getSimpleVT() == MVT::f64) { + assert(NumElts == 2 && "unexpected type for BUILD_VECTOR"); + return PairDRegs(VecVT, N->getOperand(0), N->getOperand(1)); + } + assert(EltVT.getSimpleVT() == MVT::f32 && + "unexpected type for BUILD_VECTOR"); + if (NumElts == 2) + return PairSRegs(VecVT, N->getOperand(0), N->getOperand(1)); + assert(NumElts == 4 && "unexpected type for BUILD_VECTOR"); + return QuadSRegs(VecVT, N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3)); + } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: { @@ -2342,6 +2326,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { break; } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + + case Intrinsic::arm_neon_vtbl2: + return SelectVTBL(N, false, 2, ARM::VTBL2); + case 
Intrinsic::arm_neon_vtbl3: + return SelectVTBL(N, false, 3, ARM::VTBL3); + case Intrinsic::arm_neon_vtbl4: + return SelectVTBL(N, false, 4, ARM::VTBL4); + + case Intrinsic::arm_neon_vtbx2: + return SelectVTBL(N, true, 2, ARM::VTBX2); + case Intrinsic::arm_neon_vtbx3: + return SelectVTBL(N, true, 3, ARM::VTBX3); + case Intrinsic::arm_neon_vtbx4: + return SelectVTBL(N, true, 4, ARM::VTBX4); + } + break; + } + case ISD::CONCAT_VECTORS: return SelectConcatVector(N); } @@ -2367,9 +2374,3 @@ FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel) { return new ARMDAGToDAGISel(TM, OptLevel); } - -/// ModelWithRegSequence - Return true if isel should use REG_SEQUENCE to model -/// operations involving sub-registers. -bool llvm::ModelWithRegSequence() { - return UseRegSeq; -} diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b8126a3c5d18..98d8b8585428 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMAddressingModes.h" #include "ARMConstantPoolValue.h" @@ -40,6 +41,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -47,9 +49,27 @@ #include using namespace llvm; +STATISTIC(NumTailCalls, "Number of tail calls"); + +// This option should go away when tail calls fully work. +static cl::opt +EnableARMTailCalls("arm-tail-calls", cl::Hidden, + cl::desc("Generate tail calls (TEMPORARY OPTION)."), + cl::init(true)); + static cl::opt EnableARMLongCalls("arm-long-calls", cl::Hidden, - cl::desc("Generate calls via indirect call instructions."), + cl::desc("Generate calls via indirect call instructions"), + cl::init(false)); + +static cl::opt +ARMInterworking("arm-interworking", cl::Hidden, + cl::desc("Enable / disable ARM interworking (for debugging only)"), + cl::init(true)); + +static cl::opt +EnableARMCodePlacement("arm-code-placement", cl::Hidden, + cl::desc("Enable code placement pass for ARM"), cl::init(false)); static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT, @@ -94,10 +114,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT, } setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - if (llvm::ModelWithRegSequence()) - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - else - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); @@ -393,13 +410,57 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // doesn't yet know how to not do that for SjLj. setExceptionSelectorRegister(ARM::R0); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); - - // If the subtarget does not have extract instructions, sign_extend_inreg - // needs to be expanded. Extract is available in ARM mode on v6 and up, - // and on most Thumb2 implementations. 
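// Context for the SIGN_EXTEND_INREG handling here: when the operation is
// marked Expand (no SXTB/SXTH available), the generic legalizer typically
// rewrites it as a left shift followed by an arithmetic right shift.  A
// standalone illustration of the identity involved; the function names are
// invented and the sketch assumes the usual arithmetic behaviour of signed
// right shift, which is what the SRA node being modelled provides.
#include <cstdint>

static int32_t sext8_expanded(int32_t x) {   // shl 24, then sra 24
  return (int32_t)((uint32_t)x << 24) >> 24;
}
static int32_t sext8_native(int32_t x) {     // what a single SXTB computes
  return (int32_t)(int8_t)x;
}
static int32_t sext16_expanded(int32_t x) {  // shl 16, then sra 16
  return (int32_t)((uint32_t)x << 16) >> 16;
}
static int32_t sext16_native(int32_t x) {    // what a single SXTH computes
  return (int32_t)(int16_t)x;
}
// The two forms agree for every input; expansion just costs a two-instruction
// shift pair where v6 and later can use one extend instruction.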
- if ((!Subtarget->isThumb() && !Subtarget->hasV6Ops()) - || (Subtarget->isThumb2() && !Subtarget->hasT2ExtractPack())) { + // Handle atomics directly for ARMv[67] (except for Thumb1), otherwise + // use the default expansion. + bool canHandleAtomics = + (Subtarget->hasV7Ops() || + (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only())); + if (canHandleAtomics) { + // membarrier needs custom lowering; the rest are legal and handled + // normally. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + } else { + // Set them all for expansion, which will force libcalls. + setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); + // Since the libcalls include locking, fold in the fences + setShouldFoldAtomicFences(true); + } + // 64-bit versions are always libcalls (for now) + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand); + + // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. + if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } @@ -412,8 +473,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + if (Subtarget->isTargetDarwin()) { + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -474,28 +537,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) else setSchedulingPreference(Sched::Hybrid); - // FIXME: If-converter should use instruction latency to determine - // profitability rather than relying on fixed limits. - if (Subtarget->getCPUString() == "generic") { - // Generic (and overly aggressive) if-conversion limits. - setIfCvtBlockSizeLimit(10); - setIfCvtDupBlockSizeLimit(2); - } else if (Subtarget->hasV7Ops()) { - setIfCvtBlockSizeLimit(3); - setIfCvtDupBlockSizeLimit(1); - } else if (Subtarget->hasV6Ops()) { - setIfCvtBlockSizeLimit(2); - setIfCvtDupBlockSizeLimit(1); - } else { - setIfCvtBlockSizeLimit(3); - setIfCvtDupBlockSizeLimit(2); - } - maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type - // Do not enable CodePlacementOpt for now: it currently runs after the - // ARMConstantIslandPass and messes up branch relaxation and placement - // of constant islands. - // benefitFromCodePlacementOpt = true; + + // On ARM arguments smaller than 4 bytes are extended, so all arguments + // are at least 4 bytes aligned. + setMinStackArgumentAlignment(4); + + if (EnableARMCodePlacement) + benefitFromCodePlacementOpt = true; } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -537,6 +586,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; + case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; @@ -581,6 +632,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; + case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::FMAX: return "ARMISD::FMAX"; case ARMISD::FMIN: return "ARMISD::FMIN"; } @@ -603,15 +655,33 @@ TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const { - return getTargetMachine().getSubtarget().isThumb() ? 0 : 1; + return getTargetMachine().getSubtarget().isThumb() ? 1 : 2; } Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { - for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + unsigned NumVals = N->getNumValues(); + if (!NumVals) + return Sched::RegPressure; + + for (unsigned i = 0; i != NumVals; ++i) { EVT VT = N->getValueType(i); if (VT.isFloatingPoint() || VT.isVector()) return Sched::Latency; } + + if (!N->isMachineOpcode()) + return Sched::RegPressure; + + // Load are scheduled for latency even if there instruction itinerary + // is not available. 
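// In outline, this hook ends up preferring latency-driven scheduling for
// nodes that produce FP or vector values, for loads, and for instructions
// whose itinerary latency exceeds two cycles, and register-pressure
// scheduling for everything else (including nodes with no results or no
// machine opcode).  A standalone sketch of that decision; the enum and the
// boolean inputs stand in for the real TargetLowering and itinerary queries.
enum SchedPref { SchedRegPressure, SchedLatency };

static SchedPref pickSchedPref(bool ProducesFPOrVector, bool IsMachineOp,
                               bool MayLoad, unsigned ItinLatency) {
  if (ProducesFPOrVector)
    return SchedLatency;
  if (!IsMachineOp)
    return SchedRegPressure;
  if (MayLoad)
    return SchedLatency;        // loads: assume latency even without itinerary
  if (ItinLatency > 2)
    return SchedLatency;
  return SchedRegPressure;
}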
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrDesc &TID = TII->get(N->getMachineOpcode()); + if (TID.mayLoad()) + return Sched::Latency; + + const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData(); + if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2) + return Sched::Latency; return Sched::RegPressure; } @@ -964,11 +1034,28 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { - // ARM target does not yet support tail call optimization. - isTailCall = false; + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsSibCall = false; + // Temporarily disable tail calls so things don't break. + if (!EnableARMTailCalls) + isTailCall = false; + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + // We don't support GuaranteedTailCallOpt for ARM, only automatically + // detected sibcalls. + if (isTailCall) { + ++NumTailCalls; + IsSibCall = true; + } + } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -981,9 +1068,14 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); + // For tail calls, memory operands are available in our caller's stack. + if (IsSibCall) + NumBytes = 0; + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); @@ -996,7 +1088,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = Outs[realArgIdx].Val; + SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. @@ -1044,7 +1136,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { + } else if (!IsSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1059,10 +1151,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. 
+ if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // For tail calls lower the arguments to the 'real' stack slot. + if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + + // Do not flag preceeding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -1071,7 +1185,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, bool isDirect = false; bool isARMFunc = false; bool isLocalARMFunc = false; - MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); if (EnableARMLongCalls) { @@ -1117,7 +1230,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() != Reloc::Static; isARMFunc = !Subtarget->isThumb() || isStub; // ARM call to a local ARM function is predicable. - isLocalARMFunc = !Subtarget->isThumb() && !isExt; + isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking); // tBX takes a register source operand. if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId(); @@ -1134,7 +1247,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, Callee = DAG.getNode(ARMISD::PIC_ADD, dl, getPointerTy(), Callee, PICLabel); } else - Callee = DAG.getTargetGlobalAddress(GV, getPointerTy()); + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; bool isStub = Subtarget->isTargetDarwin() && @@ -1171,11 +1284,6 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL) : ARMISD::CALL_NOLINK; } - if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) { - // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK - Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag); - InFlag = Chain.getValue(1); - } std::vector Ops; Ops.push_back(Chain); @@ -1189,9 +1297,13 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + if (isTailCall) + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + // Returns a chain and a flag for retval copy to use. 
- Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), - &Ops[0], Ops.size()); + Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), @@ -1205,10 +1317,203 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, dl, DAG, InVals); } +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const ARMInstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast(Arg.getOperand(1))->getReg(); + if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + return false; + } + } else if (LoadSDNode *Ld = dyn_cast(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, + SelectionDAG& DAG) const { + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Do not sibcall optimize vararg calls unless the call site is not passing + // any arguments. + if (isVarArg && !Outs.empty()) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo:: + // emitEpilogue is not ready for them. + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instructions, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + if (Subtarget->isThumb1Only()) + return false; + + // For the moment, we can only do this to functions defined in this + // compilation, or to indirect calls. 
A Thumb B to an ARM function, + // or vice versa, is not easily fixed up in the linker unlike BL. + // (We could do this by loading the address of the callee into a register; + // that is an extra instruction over the direct call and burns a register + // as well, so is not likely to be a win.) + + // It might be safe to remove this restriction on non-Darwin. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently do this + // case. + if (isa(Callee)) + return false; + + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); + if (GV->isDeclaration() || GV->isWeakForLinker()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector RVLocs1; + CCState CCInfo1(CalleeCC, false, getTargetMachine(), + RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg)); + + SmallVector RVLocs2; + CCState CCInfo2(CallerCC, false, getTargetMachine(), + RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), + ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, + CCAssignFnForNode(CalleeCC, false, isVarArg)); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const ARMInstrInfo *TII = + ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); + i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (VA.needsCustom()) { + // f64 and vector types are split into multiple registers or + // register/stack-slot combinations. The types will not match + // the registers; give up on memory f64 refs until we figure + // out what to do about this. 
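The calling-convention mismatch check above reduces to a pairwise comparison of where each return value lives under the callee's convention versus the caller's. A compact sketch of that comparison, using a hypothetical location record in place of CCValAssign:

    #include <vector>

    // Hypothetical stand-in for CCValAssign, for illustration only.
    struct RetLoc {
      bool inRegister;             // isRegLoc() in the real code
      unsigned regOrOffset;        // getLocReg() or getLocMemOffset()
      int locInfo;                 // getLocInfo(): extension/promotion kind
    };

    static bool compatibleReturnConventions(const std::vector<RetLoc> &calleeLocs,
                                            const std::vector<RetLoc> &callerLocs) {
      if (calleeLocs.size() != callerLocs.size())
        return false;
      for (size_t i = 0; i != calleeLocs.size(); ++i) {
        const RetLoc &a = calleeLocs[i];
        const RetLoc &b = callerLocs[i];
        // Same kind of location, same promotion, and the same register or
        // stack offset; otherwise a tail call would hand the result back in
        // the wrong place for the caller's caller.
        if (a.inRegister != b.inRegister || a.locInfo != b.locInfo ||
            a.regOrOffset != b.regOrOffset)
          return false;
      }
      return true;
    }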
+ if (!VA.isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + if (RegVT == MVT::v2f64) { + if (!ArgLocs[++i].isRegLoc()) + return false; + if (!ArgLocs[++i].isRegLoc()) + return false; + } + } else if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + } + + return true; +} + SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, DebugLoc dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. @@ -1239,7 +1544,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = Outs[realRVLocIdx].Val; + SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -1477,7 +1782,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, // pair. This is always cheaper. if (Subtarget->useMovt()) { return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, PtrVT)); + DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); @@ -1552,9 +1857,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - SDValue Val = Subtarget->isThumb() ? - DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) : - DAG.getConstant(0, MVT::i32); + SDValue Val = DAG.getConstant(0, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1), Val); } @@ -1568,8 +1871,7 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) - const { + const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); switch (IntNo) { @@ -1597,7 +1899,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, PseudoSourceValue::getConstantPool(), 0, false, false, 0); - SDValue Chain = Result.getValue(1); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); @@ -1609,25 +1910,21 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, } static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { + const ARMSubtarget *Subtarget) { DebugLoc dl = Op.getDebugLoc(); SDValue Op5 = Op.getOperand(5); - SDValue Res; unsigned isDeviceBarrier = cast(Op5)->getZExtValue(); - if (isDeviceBarrier) { - if (Subtarget->hasV7Ops()) - Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0)); - else - Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } else { - if (Subtarget->hasV7Ops()) - Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); - else - Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); - } - return Res; + // v6 and v7 can both handle barriers directly, but need handled a bit + // differently. 
Thumb1 and pre-v6 ARM mode use a libcall instead and should + // never get here. + unsigned Opc = isDeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER; + if (Subtarget->hasV7Ops()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0)); + else if (Subtarget->hasV6Ops() && !Subtarget->isThumb1Only()) + return DAG.getNode(Opc, dl, MVT::Other, Op.getOperand(0), + DAG.getConstant(0, MVT::i32)); + assert(0 && "Unexpected ISD::MEMBARRIER encountered. Should be libcall!"); + return SDValue(); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { @@ -1712,7 +2009,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue ArgValue2; if (NextVA.isMemLoc()) { MachineFrameInfo *MFI = MF.getFrameInfo(); - int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true, false); + int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1768,8 +2065,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2; if (VA.isMemLoc()) { - int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, PseudoSourceValue::getFixedStack(FI), 0, @@ -1836,8 +2132,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); unsigned ArgSize = VA.getLocVT().getSizeInBits()/8; - int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), - true, false); + int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); @@ -1868,7 +2163,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, AFI->setVarArgsFrameIndex( MFI->CreateFixedObject(VARegSaveSize, ArgOffset + VARegSaveSize - VARegSize, - true, false)); + true)); SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy()); @@ -1884,8 +2179,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), 0, - false, false, 0); + PseudoSourceValue::getFixedStack(AFI->getVarArgsFrameIndex()), + 0, false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getConstant(4, getPointerTy())); @@ -1895,8 +2190,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, &MemOps[0], MemOps.size()); } else // This will point to the next argument passed via stack. 
- AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, - true, false)); + AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true)); } return Chain; @@ -1978,9 +2272,44 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS); } +static bool canBitcastToInt(SDNode *Op) { + return Op->hasOneUse() && + ISD::isNormalLoad(Op) && + Op->getValueType(0) == MVT::f32; +} + +static SDValue bitcastToInt(SDValue Op, SelectionDAG &DAG) { + if (LoadSDNode *Ld = dyn_cast(Op)) + return DAG.getLoad(MVT::i32, Op.getDebugLoc(), + Ld->getChain(), Ld->getBasePtr(), + Ld->getSrcValue(), Ld->getSrcValueOffset(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + + llvm_unreachable("Unknown VFP cmp argument!"); +} + /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. -static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, - DebugLoc dl) { +SDValue +ARMTargetLowering::getVFPCmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, + SDValue &ARMCC, SelectionDAG &DAG, + DebugLoc dl) const { + if (UnsafeFPMath && FiniteOnlyFPMath() && + (CC == ISD::SETEQ || CC == ISD::SETOEQ || + CC == ISD::SETNE || CC == ISD::SETUNE) && + canBitcastToInt(LHS.getNode()) && canBitcastToInt(RHS.getNode())) { + // If unsafe fp math optimization is enabled and there are no othter uses of + // the CMP operands, and the condition code is EQ oe NE, we can optimize it + // to an integer comparison. + if (CC == ISD::SETOEQ) + CC = ISD::SETEQ; + else if (CC == ISD::SETUNE) + CC = ISD::SETNE; + LHS = bitcastToInt(LHS, DAG); + RHS = bitcastToInt(RHS, DAG); + return getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl); + } + SDValue Cmp; if (!isFloatingPointZero(RHS)) Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS); @@ -2010,13 +2339,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); + SDValue Cmp = getVFPCmp(LHS, RHS, CC, ARMCC, DAG, dl); SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, - ARMCC, CCR, Cmp); + ARMCC, CCR, Cmp); if (CondCode2 != ARMCC::AL) { SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. - SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl); + SDValue Cmp2 = getVFPCmp(LHS, RHS, CC, ARMCC2, DAG, dl); Result = DAG.getNode(ARMISD::CMOV, dl, VT, Result, TrueVal, ARMCC2, CCR, Cmp2); } @@ -2043,8 +2372,8 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { ARMCC::CondCodes CondCode, CondCode2; FPCCToARMCC(CC, CondCode, CondCode2); - SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl); SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32); + SDValue Cmp = getVFPCmp(LHS, RHS, CC, ARMCC, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag); SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp }; @@ -2132,7 +2461,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(Opc, dl, VT, Op); } -static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. 
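The unsafe-math fast path added to getVFPCmp above rewrites an equality or inequality compare of two single-use f32 loads into an i32 compare of their bit patterns, skipping the VFP compare and the FMSTAT flag copy. The standalone sketch below (not LLVM code) shows why that rewrite is gated on UnsafeFPMath and FiniteOnlyFPMath: bitwise equality disagrees with IEEE equality for +0.0 versus -0.0 and for NaNs.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <limits>

    // Compare two floats the way the rewritten integer CMP does: by bit pattern.
    static bool bitwiseEqual(float a, float b) {
      uint32_t ia, ib;
      std::memcpy(&ia, &a, sizeof ia);
      std::memcpy(&ib, &b, sizeof ib);
      return ia == ib;
    }

    int main() {
      // IEEE says +0.0 == -0.0, but their bit patterns differ.
      std::printf("%d %d\n", 0.0f == -0.0f, bitwiseEqual(0.0f, -0.0f));   // 1 0
      // A NaN never compares equal to itself, even though the bits match.
      float nan = std::numeric_limits<float>::quiet_NaN();
      std::printf("%d %d\n", nan == nan, bitwiseEqual(nan, nan));         // 0 1
      return 0;
    }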
SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); @@ -2140,8 +2469,10 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0); - SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl); SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32); + SDValue FP0 = DAG.getConstantFP(0.0, SrcVT); + SDValue Cmp = getVFPCmp(Tmp1, FP0, + ISD::SETLT, ARMCC, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp); } @@ -2206,7 +2537,8 @@ static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(0, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, MVT::i32)); - return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); + return DAG.getNode(ISD::BIT_CONVERT, dl, DstVT, + DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. @@ -2516,76 +2848,149 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } -/// isVMOVSplat - Check if the specified splat value corresponds to an immediate -/// VMOV instruction, and if so, return the constant being splatted. -static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef, - unsigned SplatBitSize, SelectionDAG &DAG) { +/// isNEONModifiedImm - Check if the specified splat value corresponds to a +/// valid vector constant for a NEON instruction with a "modified immediate" +/// operand (e.g., VMOV). If so, return either the constant being +/// splatted or the encoded value, depending on the DoEncode parameter. +static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, + unsigned SplatBitSize, SelectionDAG &DAG, + bool isVMOV, bool DoEncode) { + unsigned OpCmode, Imm; + EVT VT; + + // SplatBitSize is set to the smallest size that splats the vector, so a + // zero vector will always have SplatBitSize == 8. However, NEON modified + // immediate instructions others than VMOV do not support the 8-bit encoding + // of a zero vector, and the default encoding of zero is supposed to be the + // 32-bit version. + if (SplatBits == 0) + SplatBitSize = 32; + switch (SplatBitSize) { case 8: - // Any 1-byte value is OK. + // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); - return DAG.getTargetConstant(SplatBits, MVT::i8); + OpCmode = 0xe; + Imm = SplatBits; + VT = MVT::i8; + break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. - if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i16); - break; + VT = MVT::i16; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x00nn: Op=x, Cmode=100x. + OpCmode = 0x8; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0xnn00: Op=x, Cmode=101x. + OpCmode = 0xa; + Imm = SplatBits >> 8; + break; + } + return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. 
- if ((SplatBits & ~0xff) == 0 || - (SplatBits & ~0xff00) == 0 || - (SplatBits & ~0xff0000) == 0 || - (SplatBits & ~0xff000000) == 0) - return DAG.getTargetConstant(SplatBits, MVT::i32); + VT = MVT::i32; + if ((SplatBits & ~0xff) == 0) { + // Value = 0x000000nn: Op=x, Cmode=000x. + OpCmode = 0; + Imm = SplatBits; + break; + } + if ((SplatBits & ~0xff00) == 0) { + // Value = 0x0000nn00: Op=x, Cmode=001x. + OpCmode = 0x2; + Imm = SplatBits >> 8; + break; + } + if ((SplatBits & ~0xff0000) == 0) { + // Value = 0x00nn0000: Op=x, Cmode=010x. + OpCmode = 0x4; + Imm = SplatBits >> 16; + break; + } + if ((SplatBits & ~0xff000000) == 0) { + // Value = 0xnn000000: Op=x, Cmode=011x. + OpCmode = 0x6; + Imm = SplatBits >> 24; + break; + } if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) - return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32); + ((SplatBits | SplatUndef) & 0xff) == 0xff) { + // Value = 0x0000nnff: Op=x, Cmode=1100. + OpCmode = 0xc; + Imm = SplatBits >> 8; + SplatBits |= 0xff; + break; + } if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) - return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32); + ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { + // Value = 0x00nnffff: Op=x, Cmode=1101. + OpCmode = 0xd; + Imm = SplatBits >> 16; + SplatBits |= 0xffff; + break; + } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. - break; + return SDValue(); case 64: { // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. + if (!isVMOV) + return SDValue(); uint64_t BitMask = 0xff; uint64_t Val = 0; + unsigned ImmMask = 1; + Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) + if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; - else if ((SplatBits & BitMask) != 0) + Imm |= ImmMask; + } else if ((SplatBits & BitMask) != 0) { return SDValue(); + } BitMask <<= 8; + ImmMask <<= 1; } - return DAG.getTargetConstant(Val, MVT::i64); + // Op=1, Cmode=1110. + OpCmode = 0x1e; + SplatBits = Val; + VT = MVT::i64; + break; } default: - llvm_unreachable("unexpected size for isVMOVSplat"); - break; + llvm_unreachable("unexpected size for isNEONModifiedImm"); + return SDValue(); } - return SDValue(); + if (DoEncode) { + unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); + return DAG.getTargetConstant(EncodedVal, MVT::i32); + } + return DAG.getTargetConstant(SplatBits, VT); } -/// getVMOVImm - If this is a build_vector of constants which can be -/// formed by using a VMOV instruction of the specified element size, -/// return the constant being splatted. The ByteSize field indicates the -/// number of bytes of each element [1248]. -SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { +/// getNEONModImm - If this is a valid vector constant for a NEON instruction +/// with a "modified immediate" operand (e.g., VMOV) of the specified element +/// size, return the encoded value for that immediate. The ByteSize field +/// indicates the number of bytes of each element [1248]. 
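The table in isNEONModifiedImm above maps each representable splat onto an Op/Cmode selector and an 8-bit payload, packed as (OpCmode << 8) | Imm8 by ARM_AM::createNEONModImm. As a standalone sketch of just the 32-bit rows, ignoring the SplatUndef refinements in the real code:

    #include <cstdint>

    // Returns true and fills OpCmode/Imm8 when a 32-bit splat value is encodable
    // as a NEON modified immediate; first match wins, as in the code above.
    static bool encodeSplat32(uint32_t Splat, unsigned &OpCmode, unsigned &Imm8) {
      if ((Splat & ~0xffu) == 0)       { OpCmode = 0x0; Imm8 = Splat;       return true; }
      if ((Splat & ~0xff00u) == 0)     { OpCmode = 0x2; Imm8 = Splat >> 8;  return true; }
      if ((Splat & ~0xff0000u) == 0)   { OpCmode = 0x4; Imm8 = Splat >> 16; return true; }
      if ((Splat & ~0xff000000u) == 0) { OpCmode = 0x6; Imm8 = Splat >> 24; return true; }
      if ((Splat & ~0xffffu) == 0 && (Splat & 0xff) == 0xff) {
        OpCmode = 0xc; Imm8 = Splat >> 8;  return true;   // 0x0000nnff
      }
      if ((Splat & ~0xffffffu) == 0 && (Splat & 0xffff) == 0xffff) {
        OpCmode = 0xd; Imm8 = Splat >> 16; return true;   // 0x00nnffff
      }
      return false;   // e.g. 0x00ffff00 is only encodable as a VMOV.I64 byte mask
    }

    // Packing, as done by ARM_AM::createNEONModImm: (OpCmode << 8) | Imm8.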
+SDValue ARM::getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV, + SelectionDAG &DAG) { BuildVectorSDNode *BVN = dyn_cast(N); APInt SplatBits, SplatUndef; unsigned SplatBitSize; @@ -2597,8 +3002,8 @@ SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { if (SplatBitSize > ByteSize * 8) return SDValue(); - return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG); + return isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, isVMOV, true); } static bool isVEXTMask(const SmallVectorImpl &M, EVT VT, @@ -2838,8 +3243,10 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { - SDValue Val = isVMOVSplat(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, DAG); + // Check if an immediate VMOV works. + SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), + SplatUndef.getZExtValue(), + SplatBitSize, DAG, true, false); if (Val.getNode()) return BuildSplat(Val, VT, DAG, dl); } @@ -2883,21 +3290,17 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ARMISD::VDUP, dl, VT, Value); // Vectors with 32- or 64-bit elements can be built by directly assigning - // the subregisters. + // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands + // will be legalized. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); - SDValue Val = DAG.getUNDEF(VecVT); - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Elt = Op.getOperand(i); - if (Elt.getOpcode() == ISD::UNDEF) - continue; - Elt = DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Elt); - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, Elt, - DAG.getConstant(i, MVT::i32)); - } + SmallVector Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BIT_CONVERT, dl, EltVT, Op.getOperand(i))); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } @@ -2934,7 +3337,9 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, bool ReverseVEXT; unsigned Imm, WhichResult; - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + return (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || @@ -3032,59 +3437,62 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // of the same time so that they get CSEd properly. SVN->getMask(ShuffleMask); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 32) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. 
+ if (Lane == -1) Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); + } + return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i32)); } - return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i32)); - } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { - if (ReverseVEXT) - std::swap(V1, V2); - return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } - - if (isVREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARMISD::VREV64, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARMISD::VREV32, dl, VT, V1); - if (isVREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - - // Check for Neon shuffles that modify both input vectors in place. - // If both results are used, i.e., if there are two shuffles with the same - // source operands and with masks corresponding to both results of one of - // these operations, DAG memoization will ensure that a single node is - // used for both shuffles. - unsigned WhichResult; - if (isVTRNMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVUZPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); - if (isVZIPMask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V2).getValue(WhichResult); + bool ReverseVEXT; + unsigned Imm; + if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + if (ReverseVEXT) + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } - if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); - if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) - return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), - V1, V1).getValue(WhichResult); + if (isVREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(ARMISD::VREV64, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(ARMISD::VREV32, dl, VT, V1); + if (isVREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. 
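The mask-based cases above (VDUP, VDUPLANE, VEXT, VREV, VTRN, VUZP, VZIP) all key off the shuffle mask, whose meaning is worth spelling out. A small sketch, not LLVM code, of how a length-4 mask selects elements, in the same way the EltSize >= 32 fallback further below rebuilds the vector element by element:

    #include <cstdint>

    // Mask entries below NumElts index into V1, entries from NumElts upward
    // index into V2, and a negative entry is "undef" (any value may be
    // produced; zero is used here). NumElts is a power of two, so masking with
    // (NumElts - 1) strips the V1/V2 selector bit.
    static void applyShuffleMask(const uint32_t V1[4], const uint32_t V2[4],
                                 const int Mask[4], uint32_t Out[4]) {
      const int NumElts = 4;
      for (int i = 0; i < NumElts; ++i) {
        if (Mask[i] < 0) {
          Out[i] = 0;                                    // undef lane
          continue;
        }
        const uint32_t *Src = Mask[i] < NumElts ? V1 : V2;
        Out[i] = Src[Mask[i] & (NumElts - 1)];
      }
    }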
+ unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V1).getValue(WhichResult); + } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. @@ -3108,8 +3516,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - // Implement shuffles with 32- or 64-bit elements as subreg copies. - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. @@ -3117,17 +3524,17 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); V1 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V1); V2 = DAG.getNode(ISD::BIT_CONVERT, dl, VecVT, V2); - SDValue Val = DAG.getUNDEF(VecVT); + SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) - continue; - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, - ShuffleMask[i] < (int)NumElts ? V1 : V2, - DAG.getConstant(ShuffleMask[i] & (NumElts-1), - MVT::i32)); - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Val, - Elt, DAG.getConstant(i, MVT::i32)); + Ops.push_back(DAG.getUNDEF(EltVT)); + else + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + ShuffleMask[i] < (int)NumElts ? V1 : V2, + DAG.getConstant(ShuffleMask[i] & (NumElts-1), + MVT::i32))); } + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Val); } @@ -3277,7 +3684,12 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); // thisMBB: // ... @@ -3315,7 +3727,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. 
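EmitAtomicCmpSwap above expands the pseudo into a load-exclusive/compare block (loop1MBB), a store-exclusive block that retries when the exclusive store fails (loop2MBB), and an exit block that now also receives the instructions that used to follow the pseudo. As a behavioral sketch only, using std::atomic in place of LDREX/STREX, the control flow is:

    #include <atomic>

    // loop1MBB: read the current value and compare it with the expected one.
    // loop2MBB: attempt the conditional store; like STREX, compare_exchange_weak
    //           may fail spuriously, in which case control returns to loop1MBB.
    // exitMBB:  reached when the comparison fails or the store succeeds.
    static bool atomicCmpSwap(std::atomic<int> &addr, int expected, int desired) {
      int seen = expected;
      while (!addr.compare_exchange_weak(seen, desired)) {
        if (seen != expected)
          return false;            // value differed: branch to exitMBB
        seen = expected;           // spurious store failure: retry loop1MBB
      }
      return true;                 // exclusive store succeeded
    }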
return BB; } @@ -3358,7 +3770,12 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); - exitMBB->transferSuccessors(BB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); MachineRegisterInfo &RegInfo = MF->getRegInfo(); unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass); @@ -3403,7 +3820,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, // ... BB = exitMBB; - MF->DeleteMachineInstr(MI); // The instruction is gone now. + MI->eraseFromParent(); // The instruction is gone now. return BB; } @@ -3488,22 +3905,21 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) - .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); F->insert(It, copy0MBB); F->insert(It, sinkMBB); - // Update machine-CFG edges by first adding all successors of the current - // block to the new block which will contain the Phi node for the select. - for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), - E = BB->succ_end(); I != E; ++I) - sinkMBB->addSuccessor(*I); - // Next, remove all successors of the current block, and add the true - // and fallthrough blocks as its successors. - while (!BB->succ_empty()) - BB->removeSuccessor(BB->succ_begin()); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); + BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -3516,11 +3932,12 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; - BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg()) + BuildMI(*BB, BB->begin(), dl, + TII->get(ARM::PHI), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); - F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } @@ -3541,7 +3958,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg); unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) ? 
ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP) + BuildMI(*BB, MI, dl, TII->get(CopyOpc), ARM::SP) .addReg(SrcReg, getKillRegState(SrcIsKill)); } @@ -3573,7 +3990,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, NeedPred = true; NeedCC = true; NeedOp3 = true; break; } - MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(OpOpc), ARM::SP); if (OpOpc == ARM::tAND) AddDefaultT1CC(MIB); MIB.addReg(ARM::SP); @@ -3589,10 +4006,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg); unsigned CopyOpc = (RC == ARM::tGPRRegisterClass) ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr; - BuildMI(BB, dl, TII->get(CopyOpc)) + BuildMI(*BB, MI, dl, TII->get(CopyOpc)) .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)) .addReg(ARM::SP); - MF->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + MI->eraseFromParent(); // The pseudo instruction is gone now. return BB; } } @@ -3893,7 +4310,8 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; - llvm_unreachable("invalid shift count for narrowing vector shift intrinsic"); + llvm_unreachable("invalid shift count for narrowing vector shift " + "intrinsic"); default: llvm_unreachable("unhandled vector shift"); @@ -4156,14 +4574,13 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const { if (!Subtarget->hasV6Ops()) // Pre-v6 does not support unaligned mem access. return false; - else { - // v6+ may or may not support unaligned mem access depending on the system - // configuration. - // FIXME: This is pretty conservative. Should we provide cmdline option to - // control the behaviour? - if (!Subtarget->isTargetDarwin()) - return false; - } + + // v6+ may or may not support unaligned mem access depending on the system + // configuration. + // FIXME: This is pretty conservative. Should we provide cmdline option to + // control the behaviour? + if (!Subtarget->isTargetDarwin()) + return false; switch (VT.getSimpleVT().SimpleTy) { default: @@ -4619,7 +5036,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } } if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(0U, ARM::CCRRegisterClass); + return std::make_pair(unsigned(ARM::CPSR), ARM::CCRRegisterClass); return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); } @@ -4669,7 +5086,6 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint, /// vector. If it is invalid, don't add anything to Ops. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Constraint, - bool hasMemory, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); @@ -4818,8 +5234,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, Ops.push_back(Result); return; } - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, - Ops, DAG); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } bool diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 9c7517c88195..3a3866928a0e 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -70,6 +70,8 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. 
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + TC_RETURN, // Tail call return pseudo. + THREAD_POINTER, DYN_ALLOC, // Dynamic allocation on the stack. @@ -133,6 +135,13 @@ namespace llvm { VUZP, // unzip (deinterleave) VTRN, // transpose + // Operands of the standard BUILD_VECTOR node are not legalized, which + // is fine if BUILD_VECTORs are always lowered to shuffles or other + // operations, but for ARM some BUILD_VECTORs are legal as-is and their + // operands need to be legalized. Define an ARM-specific version of + // BUILD_VECTOR for this purpose. + BUILD_VECTOR, + // Floating-point max and min: FMAX, FMIN @@ -141,11 +150,12 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace ARM { - /// getVMOVImm - If this is a build_vector of constants which can be - /// formed by using a VMOV instruction of the specified element size, - /// return the constant being splatted. The ByteSize field indicates the - /// number of bytes of each element [1248]. - SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); + /// getNEONModImm - If this is a valid vector constant for a NEON + /// instruction with a "modified immediate" operand (e.g., VMOV) of the + /// specified element size, return the encoded value for that immediate. + /// The ByteSize field indicates the number of bytes of each element [1248]. + SDValue getNEONModImm(SDNode *N, unsigned ByteSize, bool isVMOV, + SelectionDAG &DAG); /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd) @@ -189,9 +199,9 @@ namespace llvm { bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// isLegalICmpImmediate - Return true if the specified immediate is legal - /// icmp immediate, that is the target has icmp instructions which can compare - /// a register against the immediate without having to materialize the - /// immediate into a register. + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. virtual bool isLegalICmpImmediate(int64_t Imm) const; /// getPreIndexedAddressParts - returns true by value, base pointer and @@ -232,7 +242,6 @@ namespace llvm { /// being processed is 'm'. 
virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector &Ops, SelectionDAG &DAG) const; @@ -282,7 +291,8 @@ namespace llvm { SDValue &Root, SelectionDAG &DAG, DebugLoc dl) const; - CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, + bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, @@ -303,6 +313,7 @@ namespace llvm { SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; @@ -327,18 +338,34 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, + SelectionDAG& DAG) const; virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, DebugLoc dl, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const; + SDValue getVFPCmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, + SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) const; MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index d487df16df5e..ac568e75ccc4 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -50,27 +50,23 @@ def VFPLdStMulFrm : Format<22>; def VFPMiscFrm : Format<23>; def ThumbFrm : Format<24>; - -def NEONFrm : Format<25>; -def NEONGetLnFrm : Format<26>; -def NEONSetLnFrm : Format<27>; -def NEONDupFrm : Format<28>; - -def MiscFrm : Format<29>; -def ThumbMiscFrm : Format<30>; - -def NLdStFrm : Format<31>; -def N1RegModImmFrm : Format<32>; -def N2RegFrm : Format<33>; -def NVCVTFrm : Format<34>; -def NVDupLnFrm : Format<35>; -def N2RegVShLFrm : Format<36>; -def N2RegVShRFrm : Format<37>; -def N3RegFrm : Format<38>; -def N3RegVShFrm : Format<39>; -def NVExtFrm : Format<40>; -def NVMulSLFrm : Format<41>; -def NVTBLFrm : Format<42>; +def MiscFrm : Format<25>; + +def NGetLnFrm : Format<26>; +def NSetLnFrm : Format<27>; +def NDupFrm : Format<28>; +def NLdStFrm : Format<29>; +def N1RegModImmFrm: Format<30>; +def N2RegFrm : Format<31>; +def NVCVTFrm : Format<32>; +def NVDupLnFrm : Format<33>; +def N2RegVShLFrm : Format<34>; +def N2RegVShRFrm : Format<35>; +def N3RegFrm : Format<36>; +def N3RegVShFrm : Format<37>; +def NVExtFrm : Format<38>; +def NVMulSLFrm : Format<39>; +def NVTBLFrm : Format<40>; 
// Misc flags. @@ -1653,17 +1649,17 @@ class NVLaneOp opcod1, bits<4> opcod2, bits<2> opcod3, class NVGetLane opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list pattern> - : NVLaneOp; class NVSetLane opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list pattern> - : NVLaneOp; class NVDup opcod1, bits<4> opcod2, bits<2> opcod3, dag oops, dag iops, InstrItinClass itin, string opc, string dt, string asm, list pattern> - : NVLaneOp; // Vector Duplicate Lane (from scalar to all elements) diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp index 85f6b4019a8e..ba228ffac8ed 100644 --- a/lib/Target/ARM/ARMInstrInfo.cpp +++ b/lib/Target/ARM/ARMInstrInfo.cpp @@ -63,7 +63,7 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { void ARMInstrInfo:: reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo &TRI) const { DebugLoc dl = Orig->getDebugLoc(); unsigned Opcode = Orig->getOpcode(); switch (Opcode) { diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index d4199d1267fd..4563ffea7b9c 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ public: void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo &TRI) const; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index f3156d94693b..c73e204a26b3 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -53,6 +53,8 @@ def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + // Node definitions. def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; @@ -117,6 +119,9 @@ def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6, def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; +def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, + [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; + //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // @@ -858,13 +863,13 @@ def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo, IIC_iALUi, "adr$p\t$dst, #$label", []>; +} // neverHasSideEffects def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), Pseudo, IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []> { let Inst{25} = 1; } -} // neverHasSideEffects //===----------------------------------------------------------------------===// // Control Flow Instructions. @@ -1026,6 +1031,74 @@ let isCall = 1, } } +// Tail calls. + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { + // Darwin versions. 
+ let Defs = [R0, R1, R2, R3, R9, R12, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, + D27, D28, D29, D30, D31, PC], + Uses = [SP] in { + def TCRETURNdi : AInoP<(outs), (ins i32imm:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + + def TCRETURNri : AInoP<(outs), (ins tcGPR:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsDarwin]>; + + def TAILJMPd : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]>; + + def TAILJMPdt: ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b.w\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]>; + + def TAILJMPr : AXI<(outs), (ins tcGPR:$dst, variable_ops), + BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", + []>, Requires<[IsDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + } + + // Non-Darwin versions (the difference is R9). + let Defs = [R0, R1, R2, R3, R12, + D0, D1, D2, D3, D4, D5, D6, D7, + D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, + D27, D28, D29, D30, D31, PC], + Uses = [SP] in { + def TCRETURNdiND : AInoP<(outs), (ins i32imm:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + + def TCRETURNriND : AInoP<(outs), (ins tcGPR:$dst, variable_ops), + Pseudo, IIC_Br, + "@TC_RETURN","\t$dst", []>, Requires<[IsNotDarwin]>; + + def TAILJMPdND : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b\t$dst @ TAILCALL", + []>, Requires<[IsARM, IsNotDarwin]>; + + def TAILJMPdNDt : ABXI<0b1010, (outs), (ins brtarget:$dst, variable_ops), + IIC_Br, "b.w\t$dst @ TAILCALL", + []>, Requires<[IsThumb, IsNotDarwin]>; + + def TAILJMPrND : AXI<(outs), (ins tcGPR:$dst, variable_ops), + BrMiscFrm, IIC_Br, "bx\t$dst @ TAILCALL", + []>, Requires<[IsNotDarwin]> { + let Inst{7-4} = 0b0001; + let Inst{19-8} = 0b111111111111; + let Inst{27-20} = 0b00010010; + let Inst{31-28} = 0b1110; + } + } +} + let isBranch = 1, isTerminator = 1 in { // B is "predicable" since it can be xformed into a Bcc. let isBarrier = 1 in { @@ -1397,6 +1470,14 @@ def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr, let Inst{25} = 0; } +// A version for the smaller set of tail call registers. 
+let neverHasSideEffects = 1 in +def MOVr_TC : AsI1<0b1101, (outs tcGPR:$dst), (ins tcGPR:$src), DPFrm, + IIC_iMOVr, "mov", "\t$dst, $src", []>, UnaryDP { + let Inst{11-4} = 0b00000000; + let Inst{25} = 0; +} + def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, IIC_iMOVsr, "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP { @@ -2530,31 +2611,30 @@ let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ] in { + D31 ], hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val), AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, NoItinerary, - "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" - "add\t$val, pc, #8\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + "add\t$val, pc, #8\t${:comment} eh_setjmp begin\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, HasVFP2]>; } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { def Int_eh_sjlj_setjmp_nofp : XI<(outs), (ins GPR:$src, GPR:$val), AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, NoItinerary, - "str\tsp, [$src, #+8] ${:comment} eh_setjmp begin\n\t" - "add\t$val, pc, #8\n\t" - "str\t$val, [$src, #+4]\n\t" - "mov\tr0, #0\n\t" - "add\tpc, pc, #0\n\t" - "mov\tr0, #1 ${:comment} eh_setjmp end", "", + "add\t$val, pc, #8\n ${:comment} eh_setjmp begin\n\t" + "str\t$val, [$src, #+4]\n\t" + "mov\tr0, #0\n\t" + "add\tpc, pc, #0\n\t" + "mov\tr0, #1 ${:comment} eh_setjmp end", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>, Requires<[IsARM, NoVFP]>; } @@ -2621,6 +2701,24 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), // TODO: add,sub,and, 3-instr forms? 
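The pseudo-instructions above, together with the ARMtcret selection patterns that follow, cover two shapes of tail call: a direct one (the TCRETURNdi/TAILJMPd family, a plain branch) and an indirect one through a register drawn from the restricted tcGPR class (the TCRETURNri/TAILJMPr family, emitted as bx). A hypothetical source-level illustration, with the usual caveat that the eligibility checks in LowerCall decide whether either call is actually lowered this way:

    // Hypothetical functions, not from the patch.
    int target(int x);                                  // assumed external
    int directTail(int x) { return target(x); }         // direct: branch
    int indirectTail(int (*fn)(int), int x) {
      return fn(x);                                     // indirect: bx via tcGPR
    }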
+// Tail calls +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNri tcGPR:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdi texternalsym:$dst)>, Requires<[IsDarwin]>; + +def : ARMPat<(ARMtcret tcGPR:$dst), + (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; + +def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), + (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotDarwin]>; // Direct calls def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 197ec16eedea..a84315f73038 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -98,17 +98,8 @@ def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; // NEON operand definitions //===----------------------------------------------------------------------===// -def h8imm : Operand { - let PrintMethod = "printHex8ImmOperand"; -} -def h16imm : Operand { - let PrintMethod = "printHex16ImmOperand"; -} -def h32imm : Operand { - let PrintMethod = "printHex32ImmOperand"; -} -def h64imm : Operand { - let PrintMethod = "printHex64ImmOperand"; +def nModImm : Operand { + let PrintMethod = "printNEONModImmOperand"; } //===----------------------------------------------------------------------===// @@ -812,11 +803,6 @@ def DSubReg_f64_reg : SDNodeXFormgetTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32); }]>; -def DSubReg_f64_other_reg : SDNodeXFormgetTargetConstant(ARM::dsub_0 + (1 - N->getZExtValue()), - MVT::i32); -}]>; // Extract S sub-registers of Q/D registers. def SSubReg_f32_reg : SDNodeXForm; // For disassembly only. defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$dst, $src, #0">; + "$dst, $src, #0">; // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, @@ -2834,73 +2820,70 @@ def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src), // VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm. def VMOV_get_imm8 : SDNodeXForm; def vmovImm8 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 1, true, *CurDAG).getNode() != 0; }], VMOV_get_imm8>; // VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm. def VMOV_get_imm16 : SDNodeXForm; def vmovImm16 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 2, true, *CurDAG).getNode() != 0; }], VMOV_get_imm16>; // VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm. def VMOV_get_imm32 : SDNodeXForm; def vmovImm32 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 4, true, *CurDAG).getNode() != 0; }], VMOV_get_imm32>; // VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm. def VMOV_get_imm64 : SDNodeXForm; def vmovImm64 : PatLeaf<(build_vector), [{ - return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0; + return ARM::getNEONModImm(N, 8, true, *CurDAG).getNode() != 0; }], VMOV_get_imm64>; -// Note: Some of the cmode bits in the following VMOV instructions need to -// be encoded based on the immed values. 
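The vmovImm64 leaf above only matches build_vectors whose 64-bit splat getNEONModImm accepts, and the 64-bit case of isNEONModifiedImm encodes such a splat by turning each all-ones byte into one bit of the 8-bit immediate (Op=1, Cmode=1110). A standalone sketch of that byte-mask encoding:

    #include <cstdint>

    // Assumes every byte of Splat is already 0x00 or 0xff; other values are
    // rejected by isNEONModifiedImm before this encoding applies. Bit N of the
    // result corresponds to byte N of the splat.
    static unsigned encodeVMOVi64ByteMask(uint64_t Splat) {
      unsigned Imm8 = 0;
      for (unsigned Byte = 0; Byte != 8; ++Byte)
        if ((Splat >> (8 * Byte)) & 0xff)
          Imm8 |= 1u << Byte;
      return Imm8;
    }

    // Example: encodeVMOVi64ByteMask(0x00ff00ff00ff00ffULL) == 0x55.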
- let isReMaterializable = 1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst), - (ins h8imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i8", "$dst, $SIMM", "", [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>; def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst), - (ins h8imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i8", "$dst, $SIMM", "", [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>; -def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), - (ins h16imm:$SIMM), IIC_VMOVImm, +def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i16", "$dst, $SIMM", "", [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>; -def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), - (ins h16imm:$SIMM), IIC_VMOVImm, +def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i16", "$dst, $SIMM", "", [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>; -def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, {?}, 1, (outs DPR:$dst), - (ins h32imm:$SIMM), IIC_VMOVImm, +def VMOVv2i32 : N1ModImm<1, 0b000, {0,?,?,0}, 0, 0, 0, 1, (outs DPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i32", "$dst, $SIMM", "", [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>; -def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, {?}, 1, (outs QPR:$dst), - (ins h32imm:$SIMM), IIC_VMOVImm, +def VMOVv4i32 : N1ModImm<1, 0b000, {0,?,?,0}, 0, 1, 0, 1, (outs QPR:$dst), + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i32", "$dst, $SIMM", "", [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>; def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst), - (ins h64imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i64", "$dst, $SIMM", "", [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>; def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst), - (ins h64imm:$SIMM), IIC_VMOVImm, + (ins nModImm:$SIMM), IIC_VMOVImm, "vmov", "i64", "$dst, $SIMM", "", [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>; } // isReMaterializable @@ -3122,17 +3105,6 @@ def VDUPfqf : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0, IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "", [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>; -def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)), - (INSERT_SUBREG QPR:$src, - (i64 (EXTRACT_SUBREG QPR:$src, - (DSubReg_f64_reg imm:$lane))), - (DSubReg_f64_other_reg imm:$lane))>; -def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)), - (INSERT_SUBREG QPR:$src, - (f64 (EXTRACT_SUBREG QPR:$src, - (DSubReg_f64_reg imm:$lane))), - (DSubReg_f64_other_reg imm:$lane))>; - // VMOVN : Vector Narrowing Move defm VMOVN : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD, "vmovn", "i", int_arm_neon_vmovn>; @@ -3319,22 +3291,16 @@ let hasExtraSrcRegAllocReq = 1 in { def VTBL2 : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTB2, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2 - DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "", []>; def VTBL3 : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTB3, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3 - DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", 
"", []>; def VTBL4 : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst), (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTB4, - "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2, - DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; + "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "", []>; } // hasExtraSrcRegAllocReq = 1 // VTBX : Vector Table Extension @@ -3348,23 +3314,18 @@ let hasExtraSrcRegAllocReq = 1 in { def VTBX2 : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), NVTBLFrm, IIC_VTBX2, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2 - DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>; + "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst", []>; def VTBX3 : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), NVTBLFrm, IIC_VTBX3, - "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>; + "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", + "$orig = $dst", []>; def VTBX4 : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), NVTBLFrm, IIC_VTBX4, "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", - "$orig = $dst", - [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1, - DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>; + "$orig = $dst", []>; } // hasExtraSrcRegAllocReq = 1 //===----------------------------------------------------------------------===// diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 40f924b679fa..bc0790dccbb5 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -894,11 +894,11 @@ def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, "adr$p\t$dst, #$label", []>, T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 +} // neverHasSideEffects def tLEApcrelJT : T1I<(outs tGPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>, T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10 -} // neverHasSideEffects //===----------------------------------------------------------------------===// // TLS Instructions @@ -923,18 +923,18 @@ let isCall = 1, // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. -// The current SP is passed in $val, and we reuse the reg as a scratch. +// $val is a scratch register for our use. 
let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R12 ], hasSideEffects = 1, + isBarrier = 1 in { def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>; } @@ -1037,7 +1037,8 @@ def : T1Pat<(i32 imm0_255_comp:$src), // scheduling. let isReMaterializable = 1 in def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + NoItinerary, + "${:comment} ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb1Only]>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index b91c089fa5db..4692f2a42133 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -637,8 +637,7 @@ multiclass T2I_st opcod, string opc, PatFrag opnode> { multiclass T2I_unary_rrot opcod, string opc, PatFrag opnode> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, ".w\t$dst, $src", - [(set GPR:$dst, (opnode GPR:$src))]>, - Requires<[HasT2ExtractPack]> { + [(set GPR:$dst, (opnode GPR:$src))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -649,8 +648,7 @@ multiclass T2I_unary_rrot opcod, string opc, PatFrag opnode> { } def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi, opc, ".w\t$dst, $src, ror $rot", - [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>, - Requires<[HasT2ExtractPack]> { + [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0100; let Inst{22-20} = opcod; @@ -661,8 +659,8 @@ multiclass T2I_unary_rrot opcod, string opc, PatFrag opnode> { } } -// SXTB16 and UXTB16 do not need the .w qualifier. -multiclass T2I_unary_rrot_nw opcod, string opc, PatFrag opnode> { +// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier. +multiclass T2I_unary_rrot_uxtb16 opcod, string opc, PatFrag opnode> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, "\t$dst, $src", [(set GPR:$dst, (opnode GPR:$src))]>, @@ -689,9 +687,9 @@ multiclass T2I_unary_rrot_nw opcod, string opc, PatFrag opnode> { } } -// DO variant - disassembly only, no pattern - -multiclass T2I_unary_rrot_DO opcod, string opc> { +// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern +// supported yet. 
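For readers without the ARM ARM at hand: uxtb16 zero-extends bytes 0 and 2 of an (optionally rotated) source into the two halfword lanes of the result, which is why the selection pattern used for t2UXTB16 further down is just an AND with 0x00FF00FF; sxtb16 sign-extends the same two bytes. A minimal C++ reference model of that behaviour (my own sketch with illustrative names, not code from this patch):

#include <cstdint>

// Rotate right, as in the "$dst, $src, ror $rot" forms; the hardware only
// accepts rotations of 0, 8, 16, or 24.
static uint32_t ror32(uint32_t V, unsigned R) {
  R &= 31;
  return R ? (V >> R) | (V << (32 - R)) : V;
}

// uxtb16: zero-extend bytes 0 and 2 into halfword lanes 0 and 1.
static uint32_t uxtb16(uint32_t Src, unsigned Rot = 0) {
  return ror32(Src, Rot) & 0x00FF00FFu;
}

// sxtb16: sign-extend bytes 0 and 2 into halfword lanes 0 and 1.
static uint32_t sxtb16(uint32_t Src, unsigned Rot = 0) {
  uint32_t V = ror32(Src, Rot);
  uint32_t Lo = (uint32_t)(int32_t)(int8_t)(V & 0xFF) & 0xFFFFu;
  uint32_t Hi = (uint32_t)(int32_t)(int8_t)((V >> 16) & 0xFF) & 0xFFFFu;
  return (Hi << 16) | Lo;
}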
+multiclass T2I_unary_rrot_sxtb16 opcod, string opc> { def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, opc, "\t$dst, $src", []> { let Inst{31-27} = 0b11111; @@ -787,6 +785,7 @@ def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi, let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } +} // neverHasSideEffects def t2LEApcrelJT : T2XI<(outs GPR:$dst), (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi, "adr$p.w\t$dst, #${label}_${id}", []> { @@ -798,7 +797,6 @@ def t2LEApcrelJT : T2XI<(outs GPR:$dst), let Inst{19-16} = 0b1111; // Rn let Inst{15} = 0; } -} // neverHasSideEffects // ADD r, sp, {so_imm|i12} def t2ADDrSPi : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm), @@ -1330,7 +1328,7 @@ defm t2SXTB : T2I_unary_rrot<0b100, "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; defm t2SXTH : T2I_unary_rrot<0b000, "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; -defm t2SXTB16 : T2I_unary_rrot_DO<0b010, "sxtb16">; +defm t2SXTB16 : T2I_unary_rrot_sxtb16<0b010, "sxtb16">; defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; @@ -1347,13 +1345,13 @@ defm t2UXTB : T2I_unary_rrot<0b101, "uxtb", UnOpFrag<(and node:$Src, 0x000000FF)>>; defm t2UXTH : T2I_unary_rrot<0b001, "uxth", UnOpFrag<(and node:$Src, 0x0000FFFF)>>; -defm t2UXTB16 : T2I_unary_rrot_nw<0b011, "uxtb16", +defm t2UXTB16 : T2I_unary_rrot_uxtb16<0b011, "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; def : T2Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 24)>; + (t2UXTB16r_rot GPR:$Src, 24)>, Requires<[HasT2ExtractPack]>; def : T2Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF), - (t2UXTB16r_rot GPR:$Src, 8)>; + (t2UXTB16r_rot GPR:$Src, 8)>, Requires<[HasT2ExtractPack]>; defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab", BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; @@ -2389,37 +2387,36 @@ let isCall = 1, // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve // all of the callee-saved resgisters, which is exactly what we want. -// The current SP is passed in $val, and we reuse the reg as a scratch. +// $val is a scratch register for our use. 
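The t2Int_eh_sjlj_setjmp pseudos defined just below (like the Thumb1 version earlier) expand to an inline sequence that stores a resume address into the buffer, yields 0 on the direct path, and yields 1 when control re-enters at label 1: after a longjmp. As a hedged illustration of that two-return contract only, using the plain C library setjmp/longjmp rather than the LLVM lowering:

#include <csetjmp>
#include <cstdio>

static std::jmp_buf Buf;

static void Thrower() {
  // Transfer control back to the setjmp site; setjmp then appears to return 1.
  std::longjmp(Buf, 1);
}

int main() {
  if (setjmp(Buf) == 0) {
    // Direct path: the analogue of "movs r0, #0" in the asm string.
    std::puts("first return");
    Thrower();
  } else {
    // Re-entered via longjmp: the analogue of "movs r0, #1" at label 1:.
    std::puts("second return");
  }
  return 0;
}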
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, - D31 ] in { + D31 ], hasSideEffects = 1, isBarrier = 1 in { def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, Requires<[IsThumb2, HasVFP2]>; } let Defs = - [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ] in { + [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR ], + hasSideEffects = 1, isBarrier = 1 in { def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val), AddrModeNone, SizeSpecial, NoItinerary, - "str\t$val, [$src, #8]\t${:comment} begin eh.setjmp\n" - "\tmov\t$val, pc\n" - "\tadds\t$val, #7\n" - "\tstr\t$val, [$src, #4]\n" - "\tmovs\tr0, #0\n" - "\tb\t1f\n" - "\tmovs\tr0, #1\t${:comment} end eh.setjmp\n" + "mov\t$val, pc\t${:comment} begin eh.setjmp\n\t" + "adds\t$val, #7\n\t" + "str\t$val, [$src, #4]\n\t" + "movs\tr0, #0\n\t" + "b\t1f\n\t" + "movs\tr0, #1\t${:comment} end eh.setjmp\n\t" "1:", "", [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>, Requires<[IsThumb2, NoVFP]>; @@ -2529,6 +2526,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br, // IT block +let Defs = [ITSTATE] in def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask), AddrModeNone, Size2Bytes, IIC_iALUx, "it$mask\t$cc", "", []> { @@ -2691,7 +2689,8 @@ def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), // scheduling. let canFoldAsLoad = 1, isReMaterializable = 1 in def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp), - NoItinerary, "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", + NoItinerary, + "${:comment} ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc", [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)), imm:$cp))]>, Requires<[IsThumb2]>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 54474cfe2e29..84c23e1a784c 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -255,25 +255,25 @@ def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, // Between half-precision and single-precision. For disassembly only. 
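The VCVTB/VCVTT definitions below convert between IEEE binary16 and binary32 values held in S registers (bottom respectively top halfword of the source for the .f32.f16 direction). As a hedged software reference for the widening f16-to-f32 direction only -- a plain bit-manipulation sketch of the format conversion under the illustrative name HalfToFloatBits, not the instruction's implementation:

#include <cstdint>

// Widen an IEEE 754 binary16 bit pattern to a binary32 bit pattern.
static uint32_t HalfToFloatBits(uint16_t H) {
  uint32_t Sign = (uint32_t)(H >> 15) << 31;
  uint32_t Exp  = (H >> 10) & 0x1F;
  uint32_t Mant = H & 0x3FF;

  if (Exp == 0x1F)                      // Inf or NaN: keep the payload bits.
    return Sign | 0x7F800000u | (Mant << 13);
  if (Exp != 0)                         // Normal number: rebias 15 -> 127.
    return Sign | ((Exp + 112) << 23) | (Mant << 13);
  if (Mant == 0)                        // Signed zero.
    return Sign;

  // Subnormal half: renormalize; every such value is a normal float.
  unsigned Shift = 0;
  while ((Mant & 0x400) == 0) {
    Mant <<= 1;
    ++Shift;
  }
  Mant &= 0x3FF;
  return Sign | ((113 - Shift) << 23) | (Mant << 13);
}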
-def VCVTBSH : ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; def : ARMPat<(f32_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; -def VCVTBHS : ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; def : ARMPat<(f16_to_f32 GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def VCVTTSH : ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$dst, $a", [/* For disassembly only; pattern left blank */]>; -def VCVTTHS : ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), +def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a", [/* For disassembly only; pattern left blank */]>; diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h index ff332b7ee15b..f5d9effeb9a5 100644 --- a/lib/Target/ARM/ARMJITInfo.h +++ b/lib/Target/ARM/ARMJITInfo.h @@ -143,7 +143,8 @@ namespace llvm { JumpTableId2AddrMap[JTI] = Addr; } - /// getPCLabelAddr - Retrieve the address of the PC label of the specified id. + /// getPCLabelAddr - Retrieve the address of the PC label of the + /// specified id. intptr_t getPCLabelAddr(unsigned Id) const { DenseMap::const_iterator I = PCLabelMap.find(Id); assert(I != PCLabelMap.end()); diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 8585c1e50105..f80e316d23e8 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -74,11 +74,14 @@ namespace { private: struct MemOpQueueEntry { int Offset; + unsigned Reg; + bool isKill; unsigned Position; MachineBasicBlock::iterator MBBI; bool Merged; - MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i) - : Offset(o), Position(p), MBBI(i), Merged(false) {} + MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, + MachineBasicBlock::iterator i) + : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; typedef SmallVector MemOpQueue; typedef MemOpQueue::iterator MemOpQueueIter; @@ -128,30 +131,30 @@ namespace { static int getLoadStoreMultipleOpcode(int Opcode) { switch (Opcode) { case ARM::LDR: - NumLDMGened++; + ++NumLDMGened; return ARM::LDM; case ARM::STR: - NumSTMGened++; + ++NumSTMGened; return ARM::STM; case ARM::t2LDRi8: case ARM::t2LDRi12: - NumLDMGened++; + ++NumLDMGened; return ARM::t2LDM; case ARM::t2STRi8: case ARM::t2STRi12: - NumSTMGened++; + ++NumSTMGened; return ARM::t2STM; case ARM::VLDRS: - NumVLDMGened++; + ++NumVLDMGened; return ARM::VLDMS; case ARM::VSTRS: - NumVSTMGened++; + ++NumVSTMGened; return ARM::VSTMS; case ARM::VLDRD: - NumVLDMGened++; + ++NumVLDMGened; return ARM::VLDMD; case ARM::VSTRD: - NumVSTMGened++; + ++NumVSTMGened; return ARM::VSTMD; default: llvm_unreachable("Unhandled opcode!"); } @@ -264,45 +267,59 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on // success. 
-void ARMLoadStoreOpt:: -MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &memOps, - unsigned memOpsBegin, - unsigned memOpsEnd, - unsigned insertAfter, - int Offset, - unsigned Base, - bool BaseKill, - int Opcode, - ARMCC::CondCodes Pred, - unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVector &Merges) { +void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, + MemOpQueue &memOps, + unsigned memOpsBegin, unsigned memOpsEnd, + unsigned insertAfter, int Offset, + unsigned Base, bool BaseKill, + int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, + DebugLoc dl, + SmallVector &Merges) { // First calculate which of the registers should be killed by the merged // instruction. - SmallVector, 8> Regs; const unsigned insertPos = memOps[insertAfter].Position; + + SmallSet UnavailRegs; + SmallSet KilledRegs; + DenseMap Killer; + for (unsigned i = 0; i < memOpsBegin; ++i) { + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + if (memOps[i].Merged) + UnavailRegs.insert(Reg); + else { + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + } + for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) { + if (memOps[i].Position < insertPos && memOps[i].isKill) { + unsigned Reg = memOps[i].Reg; + KilledRegs.insert(Reg); + Killer[Reg] = i; + } + } + + SmallVector, 8> Regs; for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - const MachineOperand &MO = memOps[i].MBBI->getOperand(0); - unsigned Reg = MO.getReg(); - bool isKill = MO.isKill(); + unsigned Reg = memOps[i].Reg; + if (UnavailRegs.count(Reg)) + // Register is killed before and it's not easy / possible to update the + // kill marker on already merged instructions. Abort. + return; // If we are inserting the merged operation after an unmerged operation that // uses the same register, make sure to transfer any kill flag. - for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j) - if (memOps[j].PositiongetOperand(0); - if (MOJ.getReg() == Reg && MOJ.isKill()) - isKill = true; - } - + bool isKill = memOps[i].isKill || KilledRegs.count(Reg); Regs.push_back(std::make_pair(Reg, isKill)); } // Try to do the merge. MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; - Loc++; + ++Loc; if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Regs)) return; @@ -311,13 +328,13 @@ MergeOpsUpdate(MachineBasicBlock &MBB, Merges.push_back(prior(Loc)); for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { // Remove kill flags from any unmerged memops that come before insertPos. - if (Regs[i-memOpsBegin].second) - for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j) - if (memOps[j].PositiongetOperand(0); - if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill()) - MOJ.setIsKill(false); - } + if (Regs[i-memOpsBegin].second) { + unsigned Reg = Regs[i-memOpsBegin].first; + if (KilledRegs.count(Reg)) { + unsigned j = Killer[Reg]; + memOps[j].MBBI->getOperand(0).setIsKill(false); + } + } MBB.erase(memOps[i].MBBI); memOps[i].Merged = true; } @@ -517,8 +534,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, } // Try merging with the previous instruction. 
- if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; if (isAM4) { if (Mode == ARM_AM::ia && isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { @@ -541,8 +561,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, } // Try merging with the next instruction. - if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; if (isAM4) { if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { @@ -669,8 +692,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); // Try merging with the previous instruction. - if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + if (MBBI != BeginMBBI) { MachineBasicBlock::iterator PrevMBBI = prior(MBBI); + while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) + --PrevMBBI; if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { DoMerge = true; AddSub = ARM_AM::sub; @@ -685,8 +711,11 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, } // Try merging with the next instruction. - if (!DoMerge && MBBI != MBB.end()) { + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (!DoMerge && MBBI != EndMBBI) { MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; if (!isAM5 && isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { DoMerge = true; @@ -759,18 +788,21 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, /// isMemoryOp - Returns true if instruction is a memory operations (that this /// pass is capable of operating on). static bool isMemoryOp(const MachineInstr *MI) { - if (MI->hasOneMemOperand()) { - const MachineMemOperand *MMO = *MI->memoperands_begin(); + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI->hasOneMemOperand()) + return false; - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) - return false; + const MachineMemOperand *MMO = *MI->memoperands_begin(); - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; - } + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO->isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO->getAlignment() < 4) + return false; // str could probably be eliminated entirely, but for now we just want // to avoid making a mess of it. @@ -898,6 +930,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum) return false; + MachineBasicBlock::iterator NewBBI = MBBI; bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; bool EvenDeadKill = isLd ? 
@@ -942,6 +975,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); ++NumSTRD2STM; } + NewBBI = llvm::prior(MBBI); } else { // Split into two instructions. assert((!isT2 || !OffReg) && @@ -962,14 +996,15 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, OffReg, false, OffUndef, Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, Pred, PredReg, TII, isT2); } else { if (OddReg == EvenReg && EvenDeadKill) { - // If the two source operands are the same, the kill marker is probably - // on the first one. e.g. + // If the two source operands are the same, the kill marker is + // probably on the first one. e.g. // t2STRDi8 %R5, %R5, %R9, 0, 14, %reg0 EvenDeadKill = false; OddDeadKill = true; @@ -978,6 +1013,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, OffReg, false, OffUndef, Pred, PredReg, TII, isT2); + NewBBI = llvm::prior(MBBI); InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef, @@ -989,8 +1025,9 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, ++NumSTRD2STR; } - MBBI = prior(MBBI); MBB.erase(MI); + MBBI = NewBBI; + return true; } return false; } @@ -1023,6 +1060,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (isMemOp) { int Opcode = MBBI->getOpcode(); unsigned Size = getLSMultipleTransferSize(MBBI); + const MachineOperand &MO = MBBI->getOperand(0); + unsigned Reg = MO.getReg(); + bool isKill = MO.isDef() ? false : MO.isKill(); unsigned Base = MBBI->getOperand(1).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); @@ -1044,8 +1084,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrSize = Size; CurrPred = Pred; CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); + ++NumMemOps; Advance = true; } else { if (Clobber) { @@ -1057,15 +1097,17 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // No need to match PredReg. // Continue adding to the queue. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; } else { for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI)); - NumMemOps++; + MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; break; } else if (Offset == I->Offset) { @@ -1078,7 +1120,12 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } - if (Advance) { + if (MBBI->isDebugValue()) { + ++MBBI; + if (MBBI == E) + // Reach the end of the block, try merging the memory instructions. + TryMerge = true; + } else if (Advance) { ++Position; ++MBBI; if (MBBI == E) @@ -1279,7 +1326,7 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, // some day. 
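A recurring change in these load/store-optimizer hunks is to step over DBG_VALUE instructions when looking at neighbouring instructions, so that the presence of debug info cannot change which memory operations get merged or rescheduled. Reduced to a generic sketch over a std::list with assumed names (not the MachineBasicBlock API), the idiom is:

#include <list>

struct Instr {
  bool IsDebugValue; // a DBG_VALUE-style marker rather than real code
  int  Opcode;
};

// Return the previous *real* instruction before It. Mirrors the loops added
// in this patch: it stops at begin() even if begin() is itself a debug value.
static std::list<Instr>::iterator
PrevRealInstr(std::list<Instr> &MBB, std::list<Instr>::iterator It) {
  std::list<Instr>::iterator Begin = MBB.begin();
  if (It == Begin)
    return Begin;
  std::list<Instr>::iterator Prev = It;
  --Prev;
  while (Prev != Begin && Prev->IsDebugValue)
    --Prev;
  return Prev;
}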
SmallSet AddedRegPressure; while (++I != E) { - if (MemOps.count(&*I)) + if (I->isDebugValue() || MemOps.count(&*I)) continue; const TargetInstrDesc &TID = I->getDesc(); if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) @@ -1411,7 +1458,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, std::sort(Ops.begin(), Ops.end(), OffsetCompare()); // The loads / stores of the same base are in order. Scan them from first to - // last and check for the followins: + // last and check for the following: // 1. Any def of base. // 2. Any gaps. while (Ops.size() > 1) { @@ -1474,7 +1521,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, } else { // This is the new location for the loads / stores. MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; - while (InsertPos != MBB->end() && MemOps.count(InsertPos)) + while (InsertPos != MBB->end() + && (MemOps.count(InsertPos) || InsertPos->isDebugValue())) ++InsertPos; // If we are moving a pair of loads / stores, see if it makes sense @@ -1562,7 +1610,9 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { break; } - MI2LocMap[MI] = Loc++; + if (!MI->isDebugValue()) + MI2LocMap[MI] = ++Loc; + if (!isMemoryOp(MI)) continue; unsigned PredReg = 0; diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index 013427635592..7e57a1ca5576 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -88,6 +88,9 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex; + /// HasITBlocks - True if IT blocks have been inserted. + bool HasITBlocks; + public: ARMFunctionInfo() : isThumb(false), @@ -97,7 +100,8 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), + HasITBlocks(false) {} explicit ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getTarget().getSubtarget().isThumb()), @@ -108,7 +112,8 @@ public: GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32), SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()), - JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0) {} + JumpTableUId(0), ConstPoolEntryUId(0), VarArgsFrameIndex(0), + HasITBlocks(false) {} bool isThumbFunction() const { return isThumb; } bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } @@ -229,6 +234,9 @@ public: int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + bool hasITBlocks() const { return HasITBlocks; } + void setHasITBlocks(bool h) { HasITBlocks = h; } }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 6beca8b9199b..d020f3c74bde 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -153,11 +153,11 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // Pseudo 256-bit registers to represent pairs of Q registers. These should // never be present in the emitted code. -// These are used for NEON load / store instructions, e.g. vld4, vst3. 
-// NOTE: It's possible to define more QQ registers since technical the -// starting D register number doesn't have to be multiple of 4. e.g. -// D1, D2, D3, D4 would be a legal quad. But that would make the sub-register -// stuffs very messy. +// These are used for NEON load / store instructions, e.g., vld4, vst3. +// NOTE: It's possible to define more QQ registers since technically the +// starting D register number doesn't have to be multiple of 4, e.g., +// D1, D2, D3, D4 would be a legal quad, but that would make the subregister +// stuff very messy. let SubRegIndices = [qsub_0, qsub_1] in { let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1), (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1), @@ -183,7 +183,8 @@ let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1), (ssub_8 qqsub_1, ssub_0), (ssub_9 qqsub_1, ssub_1), (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3), (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5), - (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in { + (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in +{ def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>; def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>; } @@ -196,9 +197,9 @@ def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>; } // Current Program Status Register. -def CPSR : ARMReg<0, "cpsr">; - -def FPSCR : ARMReg<1, "fpscr">; +def CPSR : ARMReg<0, "cpsr">; +def FPSCR : ARMReg<1, "fpscr">; +def ITSTATE : ARMReg<2, "itstate">; // Register classes. // @@ -348,6 +349,73 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> { }]; } +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of +// this class and the preceding one(!) This is what we want. +def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // R9 is available. + static const unsigned ARM_GPR_R9_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R9, ARM::R12 }; + // R9 is not available. + static const unsigned ARM_GPR_NOR9_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3, + ARM::R12 }; + + // For Thumb1 mode, we don't want to allocate hi regs at all, as we + // don't know how to spill them. If we make our prologue/epilogue code + // smarter at some point, we can go back to using the above allocation + // orders for the Thumb1 instructions that know how to use hi regs. + static const unsigned THUMB_GPR_AO_TC[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + + tcGPRClass::iterator + tcGPRClass::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget(); + if (Subtarget.isThumb1Only()) + return THUMB_GPR_AO_TC; + if (Subtarget.isTargetDarwin()) { + if (Subtarget.isR9Reserved()) + return ARM_GPR_NOR9_TC; + else + return ARM_GPR_R9_TC; + } else + // R9 is either callee-saved or reserved; can't use it. 
+ return ARM_GPR_NOR9_TC; + } + + tcGPRClass::iterator + tcGPRClass::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget(); + GPRClass::iterator I; + + if (Subtarget.isThumb1Only()) { + I = THUMB_GPR_AO_TC + (sizeof(THUMB_GPR_AO_TC)/sizeof(unsigned)); + return I; + } + + if (Subtarget.isTargetDarwin()) { + if (Subtarget.isR9Reserved()) + I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); + else + I = ARM_GPR_R9_TC + (sizeof(ARM_GPR_R9_TC)/sizeof(unsigned)); + } else + // R9 is either callee-saved or reserved; can't use it. + I = ARM_GPR_NOR9_TC + (sizeof(ARM_GPR_NOR9_TC)/sizeof(unsigned)); + return I; + } + }]; +} + + // Scalar single precision floating point register class.. def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, @@ -479,4 +547,3 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], // Condition code registers. def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; - diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index bbfc0b2b4744..282abca98803 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -1,10 +1,10 @@ //=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file defines the itinerary class data for the ARM Cortex A8 processors. @@ -32,50 +32,50 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData]>, // // Binary Instructions that produce a result - InstrItinData], [2, 2]>, - InstrItinData], [2, 2, 2]>, - InstrItinData], [2, 2, 1]>, - InstrItinData], [2, 2, 1, 1]>, + InstrItinData], [2, 2]>, + InstrItinData], [2, 2, 2]>, + InstrItinData], [2, 2, 1]>, + InstrItinData], [2, 2, 1, 1]>, // // Unary Instructions that produce a result - InstrItinData], [2, 2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + InstrItinData], [2, 2]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1, 1]>, // // Compare instructions - InstrItinData], [2]>, - InstrItinData], [2, 2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + InstrItinData], [2]>, + InstrItinData], [2, 2]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1, 1]>, // // Move instructions, unconditional - InstrItinData], [1]>, - InstrItinData], [1, 1]>, - InstrItinData], [1, 1]>, - InstrItinData], [1, 1, 1]>, + InstrItinData], [1]>, + InstrItinData], [1, 1]>, + InstrItinData], [1, 1]>, + InstrItinData], [1, 1, 1]>, // // Move instructions, conditional - InstrItinData], [2]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1]>, - InstrItinData], [2, 1, 1]>, + InstrItinData], [2]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1, 1]>, // Integer multiply pipeline // Result written in E5, but that is relative to the last cycle of multicycle, // so we use 6 for those cases // InstrItinData], [5, 1, 1]>, - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>, - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>, - InstrItinData, + InstrItinData, InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - InstrItinData, + 
InstrItinData, InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>, - + // Integer load pipeline // // loads have an extra cycle of latency, but are fully pipelined @@ -166,7 +166,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0]>]>, - + // Branch // // no delay slots, so the latency of a branch is unimportant @@ -276,14 +276,14 @@ def CortexA8Itineraries : ProcessorItineraries< // // Single-precision FP Load // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // Double-precision FP Load // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0], 0>, InstrStage<1, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -292,7 +292,7 @@ def CortexA8Itineraries : ProcessorItineraries< // // FP Load Multiple // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -301,14 +301,14 @@ def CortexA8Itineraries : ProcessorItineraries< // // Single-precision FP Store // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // Double-precision FP Store // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0], 0>, InstrStage<1, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -317,7 +317,7 @@ def CortexA8Itineraries : ProcessorItineraries< // // FP Store Multiple // use A8_Issue to enforce the 1 load/store per cycle limit - InstrItinData, + InstrItinData, InstrStage<2, [A8_Pipe0], 0>, InstrStage<2, [A8_Pipe1]>, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, @@ -329,35 +329,35 @@ def CortexA8Itineraries : ProcessorItineraries< // // VLD1 // FIXME: We don't model this instruction properly - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, // // VLD2 // FIXME: We don't model this instruction properly - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 1]>, // // VLD3 // FIXME: We don't model this instruction properly - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 1]>, // // VLD4 // FIXME: We don't model this instruction properly - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>], [2, 2, 2, 2, 1]>, // // VST // FIXME: We don't model this instruction properly - InstrItinData, + InstrItinData, InstrStage<1, [A8_Pipe0, A8_Pipe1]>, InstrStage<1, [A8_LdSt0], 0>, InstrStage<1, [A8_NLSPipe]>]>, @@ -600,7 +600,7 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 3, 1]>, + InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>, // // VTBX InstrItinData, @@ -610,9 +610,9 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 1]>, + InstrStage<2, 
[A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>, InstrItinData, InstrStage<1, [A8_NLSPipe]>, InstrStage<1, [A8_NPipe], 0>, - InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 75320d929952..df2f896a8d4b 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1,10 +1,10 @@ //=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file defines the itinerary class data for the ARM Cortex A9 processors. @@ -16,7 +16,6 @@ // Reference Manual". // // Functional units -def A9_Issue : FuncUnit; // issue def A9_Pipe0 : FuncUnit; // pipeline 0 def A9_Pipe1 : FuncUnit; // pipeline 1 def A9_LSPipe : FuncUnit; // LS pipe @@ -27,7 +26,121 @@ def A9_DRegsN : FuncUnit; // FP register set, NEON side // Dual issue pipeline represented by A9_Pipe0 | A9_Pipe1 // def CortexA9Itineraries : ProcessorItineraries< - [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1, A9_Issue], [ + [A9_NPipe, A9_DRegsN, A9_DRegsVFP, A9_LSPipe, A9_Pipe0, A9_Pipe1], [ + // Two fully-pipelined integer ALU pipelines + // FIXME: There are no operand latencies for these instructions at all! + // + // Move instructions, unconditional + InstrItinData], [1]>, + InstrItinData], [1, 1]>, + InstrItinData], [1, 1]>, + InstrItinData], [2, 2, 1]>, + // + // No operand cycles + InstrItinData]>, + // + // Binary Instructions that produce a result + InstrItinData], [2, 2]>, + InstrItinData], [2, 2, 2]>, + InstrItinData], [2, 2, 1]>, + InstrItinData], [2, 2, 1, 1]>, + // + // Unary Instructions that produce a result + InstrItinData], [2, 2]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1, 1]>, + // + // Compare instructions + InstrItinData], [2]>, + InstrItinData], [2, 2]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1, 1]>, + // + // Move instructions, conditional + InstrItinData], [2]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1]>, + InstrItinData], [2, 1, 1]>, + + // Integer multiply pipeline + // + InstrItinData, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, + InstrItinData, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1]>, + InstrItinData, + InstrStage<2, [A9_Pipe0]>], [4, 1, 1, 2]>, + InstrItinData, + InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + InstrItinData, + InstrStage<3, [A9_Pipe0]>], [4, 5, 1, 1]>, + // Integer load pipeline + // FIXME: The timings are some rough approximations + // + // Immediate offset + InstrItinData, + InstrStage<1, [A9_LSPipe]>], [3, 1]>, + // + // Register offset + InstrItinData, + InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset + InstrItinData, + InstrStage<2, [A9_LSPipe]>], [4, 1, 1]>, + // + // Immediate offset with update + InstrItinData, + InstrStage<2, [A9_LSPipe]>], [3, 2, 1]>, + // + // Register offset with update + InstrItinData, + InstrStage<2, [A9_LSPipe]>], [3, 2, 1, 1]>, + // + // Scaled register offset with update + InstrItinData, + InstrStage<2, [A9_LSPipe]>], [4, 3, 1, 1]>, + // + // Load multiple + InstrItinData, + InstrStage<1, [A9_LSPipe]>]>, + + // Integer store pipeline + /// + // Immediate offset + InstrItinData, + InstrStage<1, [A9_LSPipe]>], [3, 
1]>, + // + // Register offset + InstrItinData, + InstrStage<1, [A9_LSPipe]>], [3, 1, 1]>, + // + // Scaled register offset + InstrItinData, + InstrStage<2, [A9_LSPipe]>], [3, 1, 1]>, + // + // Immediate offset with update + InstrItinData, + InstrStage<1, [A9_LSPipe]>], [2, 3, 1]>, + // + // Register offset with update + InstrItinData, + InstrStage<1, [A9_LSPipe]>], [2, 3, 1, 1]>, + // + // Scaled register offset with update + InstrItinData, + InstrStage<2, [A9_LSPipe]>], [3, 3, 1, 1]>, + // + // Store multiple + InstrItinData, + InstrStage<1, [A9_LSPipe]>]>, + // Branch + // + // no delay slots, so the latency of a branch is unimportant + InstrItinData]>, + // VFP and NEON shares the same register file. This means that every VFP // instruction should wait for full completion of the consecutive NEON // instruction and vice-versa. We model this behavior with two artificial FUs: @@ -39,8 +152,8 @@ def CortexA9Itineraries : ProcessorItineraries< // register file writeback!). // Every NEON instruction does the same but with FUs swapped. // - // Since the reserved FU cannot be acquired this models precisly "cross-domain" - // stalls. + // Since the reserved FU cannot be acquired, this models precisely + // "cross-domain" stalls. // VFP // Issue through integer pipeline, and execute in NEON unit. @@ -48,21 +161,21 @@ def CortexA9Itineraries : ProcessorItineraries< // FP Special Register to Integer Register File Move InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>]>, // // Single-precision FP Unary InstrItinData, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision FP Unary InstrItinData, // Extra latency cycles since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // @@ -70,124 +183,124 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision FP Compare InstrItinData, // Extra latency cycles since wbck is 4 cycles InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Single to Double FP Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double to Single FP Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Single to Half FP Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Half to Single FP Convert InstrItinData, InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Single-Precision FP to Integer Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, 
[A9_NPipe]>], [4, 1]>, // // Double-Precision FP to Integer Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Integer to Single-Precision FP Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Integer to Double-Precision FP Convert InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Single-precision FP ALU InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Double-precision FP ALU InstrItinData, InstrStage<5, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Single-precision FP Multiply InstrItinData, InstrStage<6, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 1, 1]>, // // Double-precision FP Multiply InstrItinData, InstrStage<7, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 1, 1]>, // // Single-precision FP MAC InstrItinData, InstrStage<9, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [8, 0, 1, 1]>, // // Double-precision FP MAC InstrItinData, InstrStage<10, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [9, 0, 1, 1]>, // // Single-precision FP DIV InstrItinData, InstrStage<16, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<10, [A9_NPipe]>], [15, 1, 1]>, // // Double-precision FP DIV InstrItinData, InstrStage<26, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<20, [A9_NPipe]>], [25, 1, 1]>, // // Single-precision FP SQRT InstrItinData, InstrStage<18, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<13, [A9_NPipe]>], [17, 1]>, + InstrStage<1, [A9_Pipe1]>, + InstrStage<13, [A9_NPipe]>], [17, 1]>, // // Double-precision FP SQRT InstrItinData, InstrStage<33, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<28, [A9_NPipe]>], [32, 1]>, // @@ -195,92 +308,79 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Integer to Double-precision Move InstrItinData, // Extra 1 latency cycle since wbck is 2 cycles InstrStage<3, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, // // Single-precision to Integer Move InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [1, 1]>, // // Double-precision to Integer Move InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, 
InstrStage<1, [A9_NPipe]>], [1, 1, 1]>, // // Single-precision FP Load - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-precision FP Load - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // FP Load Multiple - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Single-precision FP Store - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-precision FP Store - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // FP Store Multiple - // use A9_Issue to enforce the 1 load/store per cycle limit InstrItinData, InstrStage<2, [A9_DRegsN], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // NEON // Issue through integer pipeline, and execute in NEON unit. - // FIXME: Neon pipeline and LdSt unit are multiplexed. + // FIXME: Neon pipeline and LdSt unit are multiplexed. // Add some syntactic sugar to model this! 
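The comments earlier in this itinerary describe how the A9 model uses reserved functional units (A9_DRegsVFP / A9_DRegsN) so that a VFP instruction blocks the NEON side until its writeback completes, and vice versa. A toy cycle-based scoreboard -- my own sketch with assumed names, unrelated to TableGen's itinerary machinery, and assuming same-domain instructions are fully pipelined -- makes the resulting cross-domain stall concrete:

#include <cstdio>

enum Domain { VFP = 0, NEON = 1 };

struct Scoreboard {
  // BusyUntil[d] is the first cycle at which domain d may issue again,
  // i.e. when the other domain's outstanding reservation has drained.
  int BusyUntil[2];
  Scoreboard() { BusyUntil[0] = BusyUntil[1] = 0; }

  // Issue an instruction of domain D no earlier than cycle Now; it reserves
  // the opposite domain's register-file unit until writeback at Start+Latency.
  int Issue(Domain D, int Now, int Latency) {
    int Start = Now > BusyUntil[D] ? Now : BusyUntil[D];
    if (Start + Latency > BusyUntil[D ^ 1])
      BusyUntil[D ^ 1] = Start + Latency;
    return Start;
  }
};

int main() {
  Scoreboard SB;
  int C0 = SB.Issue(VFP, 0, 5);  // a 5-cycle VFP multiply issues at cycle 0
  int C1 = SB.Issue(NEON, 1, 3); // the next NEON op waits for that writeback
  std::printf("VFP at cycle %d, NEON at cycle %d\n", C0, C1); // 0 and 5
  return 0;
}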
// VLD1 // FIXME: We don't model this instruction properly InstrItinData, InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // VLD2 @@ -288,9 +388,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, // // VLD3 @@ -298,9 +397,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 2, 1]>, // // VLD4 @@ -308,9 +406,8 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>], [2, 2, 2, 2, 1]>, // // VST @@ -318,121 +415,120 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Issue], 0>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<1, [A9_LSPipe], 0>, + InstrStage<1, [A9_Pipe1], 0>, + InstrStage<1, [A9_LSPipe]>, InstrStage<1, [A9_NPipe]>]>, // // Double-register Integer Unary InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2]>, // // Quad-register Integer Unary InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2]>, // // Double-register Integer Q-Unary InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Quad-register Integer CountQ-Unary InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1]>, // // Double-register Integer Binary InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Binary InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Double-register Integer Subtract InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + 
InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, // // Quad-register Integer Subtract InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 1]>, // // Double-register Integer Shift InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, // // Quad-register Integer Shift InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 1, 1]>, // // Double-register Integer Shift (4 cycle) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Quad-register Integer Shift (4 cycle) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 1, 1]>, // // Double-register Integer Binary (4 cycle) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, // // Quad-register Integer Binary (4 cycle) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 2]>, // // Double-register Integer Subtract (4 cycle) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, // // Quad-register Integer Subtract (4 cycle) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [4, 2, 1]>, // @@ -440,7 +536,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3, 2, 2]>, // // Quad-register Integer Count @@ -449,35 +545,35 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [4, 2, 2]>, // // Double-register Absolute Difference and Accumulate InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 1]>, // // Quad-register Absolute Difference and Accumulate InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, // // Double-register Integer Pair Add 
Long InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 1]>, // // Quad-register Integer Pair Add Long InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 1]>, // @@ -485,14 +581,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 2, 2]>, // // Quad-register Integer Multiply (.8, .16) InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 2, 2]>, // @@ -500,56 +596,56 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 2, 1]>, // // Quad-register Integer Multiply (.32) InstrItinData, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [9, 2, 1]>, // // Double-register Integer Multiply-Accumulate (.8, .16) InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [6, 3, 2, 2]>, // // Double-register Integer Multiply-Accumulate (.32) InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 3, 2, 1]>, // // Quad-register Integer Multiply-Accumulate (.8, .16) InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [7, 3, 2, 2]>, // // Quad-register Integer Multiply-Accumulate (.32) InstrItinData, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [9, 3, 2, 1]>, // // Move Immediate InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [3]>, // // Double-register Permute Move InstrItinData, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_LSPipe]>], [2, 1]>, // // Quad-register Permute Move @@ -558,42 +654,42 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // FIXME: all latencies are arbitrary, no information is available InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1]>, // // Integer to Single-precision Move InstrItinData, // FIXME: all 
latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Integer to Double-precision Move InstrItinData, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, // // Single-precision to Integer Move InstrItinData, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1]>, // // Double-precision to Integer Move InstrItinData, // FIXME: all latencies are arbitrary, no information is available InstrStage<3, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1]>, // // Integer to Lane Move InstrItinData, // FIXME: all latencies are arbitrary, no information is available InstrStage<4, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, // @@ -601,7 +697,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 2]>, // // Quad-register FP Unary @@ -610,7 +706,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2]>, // // Double-register FP Binary @@ -619,7 +715,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [5, 2, 2]>, // // Quad-register FP Binary @@ -630,14 +726,14 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 8 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, // // Double-register FP Multiple-Accumulate InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 3, 2, 1]>, // // Quad-register FP Multiple-Accumulate @@ -646,28 +742,28 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // // Double-register Reciprical Step InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [6, 2, 2]>, // // Quad-register Reciprical Step InstrItinData, // Extra latency cycles since wbck is 9 cycles InstrStage<10, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<4, 
[A9_NPipe]>], [8, 2, 2]>, // // Double-register Permute InstrItinData, // Extra latency cycles since wbck is 6 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 2, 1, 1]>, // // Quad-register Permute @@ -676,7 +772,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 3, 1, 1]>, // // Quad-register Permute (3 cycle issue) @@ -685,7 +781,7 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_LSPipe]>], [4, 4, 1, 1]>, // @@ -693,57 +789,57 @@ def CortexA9Itineraries : ProcessorItineraries< InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<7, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<1, [A9_NPipe]>], [2, 1, 1]>, // // Quad-register VEXT InstrItinData, // Extra latency cycles since wbck is 9 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 1]>, // // VTB InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 2, 1]>, InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 2, 2, 1]>, InstrItinData, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 1]>, InstrItinData, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 2, 2, 3, 3, 1]>, // // VTBX InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 2, 1]>, InstrItinData, // Extra latency cycles since wbck is 7 cycles InstrStage<8, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<2, [A9_NPipe]>], [3, 1, 2, 2, 1]>, InstrItinData, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, + InstrStage<1, [A9_Pipe1]>, InstrStage<3, [A9_NPipe]>], [4, 1, 2, 2, 3, 1]>, InstrItinData, // Extra latency cycles since wbck is 8 cycles InstrStage<9, [A9_DRegsVFP], 0, Reserved>, - InstrStage<1, [A9_Pipe0, A9_Pipe1]>, - InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> + InstrStage<1, [A9_Pipe1]>, + InstrStage<2, [A9_NPipe]>], [4, 1, 2, 2, 3, 3, 1]> ]>; diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index f813022d8741..08b560cc0c2f 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -16,7 +16,7 @@ // Functional Units def V6_Pipe : FuncUnit; // pipeline -// Scheduling 
information derived from "ARM1176JZF-S Technical Reference Manual". +// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual" // def ARMV6Itineraries : ProcessorItineraries< [V6_Pipe], [ diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index b4a9252909bc..09203f9304df 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -60,8 +60,10 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT, const std::string &FS) : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? - std::string("e-p:32:32-f64:32:32-i64:32:32-n32") : - std::string("e-p:32:32-f64:64:64-i64:64:64-n32")), + std::string("e-p:32:32-f64:32:32-i64:32:32-" + "v128:32:128-v64:32:64-n32") : + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "v128:64:128-v64:64:64-n32")), TLInfo(*this), TSInfo(*this) { } @@ -74,9 +76,11 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT, : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), DataLayout(Subtarget.isAPCS_ABI() ? std::string("e-p:32:32-f64:32:32-i64:32:32-" - "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32") : + "i16:16:32-i8:8:32-i1:8:32-" + "v128:32:128-v64:32:64-a:0:32-n32") : std::string("e-p:32:32-f64:64:64-i64:64:64-" - "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32")), + "i16:16:32-i8:8:32-i1:8:32-" + "v128:64:128-v64:64:64-a:0:32-n32")), TLInfo(*this), TSInfo(*this) { } @@ -98,6 +102,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); + return true; } @@ -115,21 +120,20 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, // proper scheduling. PM.add(createARMExpandPseudoPass()); - return true; -} - -bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (OptLevel != CodeGenOpt::None) { if (!Subtarget.isThumb1Only()) PM.add(createIfConverterPass()); } - - if (Subtarget.isThumb2()) { + if (Subtarget.isThumb2()) PM.add(createThumb2ITBlockPass()); + + return true; +} + +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (Subtarget.isThumb2()) PM.add(createThumb2SizeReductionPass()); - } PM.add(createARMConstantIslandPass()); return true; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index bfa89c4a4696..8415d1ad8827 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -425,7 +425,7 @@ bool ARMAsmParser::ParseMemory(OwningPtr &Op) { const AsmToken &NextTok = Parser.getTok(); if (NextTok.isNot(AsmToken::EndOfStatement)) { if (NextTok.isNot(AsmToken::Comma)) - return Error(NextTok.getLoc(), "',' expected"); + return Error(NextTok.getLoc(), "',' expected"); Parser.Lex(); // Eat comma token. 
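
[Editor's sketch] The ARMTargetMachine hunk above extends the target data layout strings with vector alignment entries that differ between the APCS and AAPCS-style configurations (v128:32:128-v64:32:64 versus v128:64:128-v64:64:64). A minimal standalone sketch of that selection, copying the strings from the hunk verbatim but using a hypothetical helper name (pickARMDataLayout) rather than LLVM's actual constructor wiring:

#include <iostream>
#include <string>

// Hypothetical helper mirroring the DataLayout initializers in the hunk above:
// APCS keeps 32-bit ABI alignment for f64/i64 and vectors, the other variant
// uses natural 64-bit alignment. The strings are copied from the hunk.
static std::string pickARMDataLayout(bool isAPCS_ABI) {
  if (isAPCS_ABI)
    return "e-p:32:32-f64:32:32-i64:32:32-"
           "v128:32:128-v64:32:64-n32";
  return "e-p:32:32-f64:64:64-i64:64:64-"
         "v128:64:128-v64:64:64-n32";
}

int main() {
  std::cout << pickARMDataLayout(true) << "\n";
  std::cout << pickARMDataLayout(false) << "\n";
  return 0;
}
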
if(ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount, Offset, OffsetIsReg, OffsetRegNum, @@ -488,7 +488,7 @@ bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative, const AsmToken &Tok = Parser.getTok(); if (ParseShift(ShiftType, ShiftAmount, E)) - return Error(Tok.getLoc(), "shift expected"); + return Error(Tok.getLoc(), "shift expected"); OffsetRegShifted = true; } } @@ -665,7 +665,6 @@ bool ARMAsmParser::ParseInstruction(const StringRef &Name, SMLoc NameLoc, Operands.push_back(Op.take()); - SMLoc Loc = Parser.getTok().getLoc(); if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. @@ -763,15 +762,10 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) { if (Tok.isNot(AsmToken::Identifier)) return Error(L, "unexpected token in .syntax directive"); const StringRef &Mode = Tok.getString(); - bool unified_syntax; - if (Mode == "unified" || Mode == "UNIFIED") { + if (Mode == "unified" || Mode == "UNIFIED") Parser.Lex(); - unified_syntax = true; - } - else if (Mode == "divided" || Mode == "DIVIDED") { + else if (Mode == "divided" || Mode == "DIVIDED") Parser.Lex(); - unified_syntax = false; - } else return Error(L, "unrecognized syntax mode in .syntax directive"); @@ -791,15 +785,10 @@ bool ARMAsmParser::ParseDirectiveCode(SMLoc L) { if (Tok.isNot(AsmToken::Integer)) return Error(L, "unexpected token in .code directive"); int64_t Val = Parser.getTok().getIntVal(); - bool thumb_mode; - if (Val == 16) { + if (Val == 16) Parser.Lex(); - thumb_mode = true; - } - else if (Val == 32) { + else if (Val == 32) Parser.Lex(); - thumb_mode = false; - } else return Error(L, "invalid operand to .code directive"); diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index d95efdb80943..6a40cf3602e9 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -175,23 +175,8 @@ namespace { raw_ostream &O); void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); - - void printHex8ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); - } - void printHex16ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); - } - void printHex32ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); - } - void printHex64ImmOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); - } + void printNEONModImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O); virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, @@ -322,7 +307,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, unsigned DRegLo = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_0); unsigned DRegHi = TM.getRegisterInfo()->getSubReg(Reg, ARM::dsub_1); O << '{' - << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi) + << getRegisterName(DRegLo) << ", " << getRegisterName(DRegHi) << '}'; } else if (Modifier && strcmp(Modifier, "lane") == 0) { unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg); @@ -617,8 +602,12 @@ void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op, O << "[" << getRegisterName(MO1.getReg()); if (MO2.getImm()) { + unsigned Align = MO2.getImm(); + assert((Align == 8 || 
Align == 16 || Align == 32) && + "unexpected NEON load/store alignment"); + Align <<= 3; // FIXME: Both darwin as and GNU as violate ARM docs here. - O << ", :" << MO2.getImm(); + O << ", :" << Align; } O << "]"; } @@ -1039,6 +1028,14 @@ void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum, } } +void ARMAsmPrinter::printNEONModImmOperand(const MachineInstr *MI, int OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); +} + bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { @@ -1064,20 +1061,10 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, printOperand(MI, OpNum, O); return false; case 'Q': - if (TM.getTargetData()->isLittleEndian()) - break; - // Fallthrough case 'R': - if (TM.getTargetData()->isBigEndian()) - break; - // Fallthrough - case 'H': // Write second word of DI / DF reference. - // Verify that this operand has two consecutive registers. - if (!MI->getOperand(OpNum).isReg() || - OpNum+1 == MI->getNumOperands() || - !MI->getOperand(OpNum+1).isReg()) - return true; - ++OpNum; // Return the high-part. + case 'H': + report_fatal_error("llvm does not support 'Q', 'R', and 'H' modifiers!"); + return true; } } @@ -1384,11 +1371,11 @@ void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) { } else if (MO.isGlobal()) { MCSymbol *Symbol = MCInstLowering.GetGlobalAddressSymbol(MO); const MCSymbolRefExpr *SymRef1 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_LO16, OutContext); + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_LO16, OutContext); const MCSymbolRefExpr *SymRef2 = - MCSymbolRefExpr::Create(Symbol, - MCSymbolRefExpr::VK_ARM_HI16, OutContext); + MCSymbolRefExpr::Create(Symbol, + MCSymbolRefExpr::VK_ARM_HI16, OutContext); V1 = MCOperand::CreateExpr(SymRef1); V2 = MCOperand::CreateExpr(SymRef2); } else { diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp index 2b94b76087e7..170819ad4f06 100644 --- a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp @@ -779,22 +779,10 @@ void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, O << '#' << MI->getOperand(OpNum).getImm(); } -void ARMInstPrinter::printHex8ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff); -} - -void ARMInstPrinter::printHex16ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff); -} - -void ARMInstPrinter::printHex32ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff); -} - -void ARMInstPrinter::printHex64ImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm()); +void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned EncodedImm = MI->getOperand(OpNum).getImm(); + unsigned EltBits; + uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); + O << "#0x" << utohexstr(Val); } diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h index be0b7c1eb027..ddf5047793d2 100644 --- 
a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h @@ -104,10 +104,7 @@ public: void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex8ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex16ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printHex64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); // FIXME: Implement. diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 29e66e13b168..0df34666b959 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -33,6 +33,7 @@ add_llvm_target(ARMCodeGen NEONPreAllocPass.cpp Thumb1InstrInfo.cpp Thumb1RegisterInfo.cpp + Thumb2HazardRecognizer.cpp Thumb2ITBlockPass.cpp Thumb2InstrInfo.cpp Thumb2RegisterInfo.cpp diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp index adb7795a746c..a07ff2832aa7 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp @@ -34,7 +34,7 @@ /// Uses and Defs by this instr. For the Uses part, the pred:$p operand is /// defined with two components: /// -/// def pred { // Operand PredicateOperand +/// def pred { // Operand PredicateOperand /// ValueType Type = OtherVT; /// string PrintMethod = "printPredicateOperand"; /// string AsmOperandLowerMethod = ?; @@ -54,7 +54,7 @@ /// /// For the Defs part, in the simple case of only cc_out:$s, we have: /// -/// def cc_out { // Operand OptionalDefOperand +/// def cc_out { // Operand OptionalDefOperand /// ValueType Type = OtherVT; /// string PrintMethod = "printSBitModifierOperand"; /// string AsmOperandLowerMethod = ?; @@ -765,7 +765,7 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn, || Opcode == ARM::SMC || Opcode == ARM::SVC) && "Unexpected Opcode"); - assert(NumOps >= 1 && OpInfo[0].RegClass == 0 && "Reg operand expected"); + assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Reg operand expected"); int Imm32 = 0; if (Opcode == ARM::SMC) { @@ -1106,7 +1106,7 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && (OpInfo[OpIdx+1].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+2].RegClass == 0) && + (OpInfo[OpIdx+2].RegClass < 0) && "Expect 3 reg operands"); // Register-controlled shifts have Inst{7} = 0 and Inst{4} = 1. @@ -1201,7 +1201,7 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass == 0) && + (OpInfo[OpIdx+1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? 
ARM_AM::add : ARM_AM::sub; @@ -1323,7 +1323,7 @@ static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) && - (OpInfo[OpIdx+1].RegClass == 0) && + (OpInfo[OpIdx+1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub; @@ -1494,7 +1494,7 @@ static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // If there is still an operand info left which is an immediate operand, add // an additional imm5 LSL/ASR operand. - if (ThreeReg && OpInfo[OpIdx].RegClass == 0 + if (ThreeReg && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 5-bit immediate field Inst{11-7}. unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F; @@ -1540,7 +1540,7 @@ static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // If there is still an operand info left which is an immediate operand, add // an additional rotate immediate operand. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Extract the 2-bit rotate field Inst{11-10}. unsigned rot = (insn >> ARMII::ExtRotImmShift) & 3; @@ -1725,7 +1725,7 @@ static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn, "Tied to operand expected"); MI.addOperand(MI.getOperand(0)); - assert(OpInfo[2].RegClass == 0 && !OpInfo[2].isPredicate() && + assert(OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef() && "Imm operand expected"); MI.addOperand(MCOperand::CreateImm(fbits)); @@ -1984,7 +1984,7 @@ static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; // Extract/decode the f64/f32 immediate. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // The asm syntax specifies the before-expanded . // Not VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0), @@ -2077,42 +2077,12 @@ static unsigned decodeLaneIndex(uint32_t insn) { // imm3 = Inst{18-16}, imm4 = Inst{3-0} // Ref: Table A7-15 Modified immediate values for Advanced SIMD instructions. static uint64_t decodeN1VImm(uint32_t insn, ElemSize esize) { + unsigned char op = (insn >> 5) & 1; unsigned char cmode = (insn >> 8) & 0xF; unsigned char Imm8 = ((insn >> 24) & 1) << 7 | ((insn >> 16) & 7) << 4 | (insn & 0xF); - uint64_t Imm64 = 0; - - switch (esize) { - case ESize8: - Imm64 = Imm8; - break; - case ESize16: - Imm64 = Imm8 << 8*(cmode >> 1 & 1); - break; - case ESize32: { - if (cmode == 12) - Imm64 = (Imm8 << 8) | 0xFF; - else if (cmode == 13) - Imm64 = (Imm8 << 16) | 0xFFFF; - else { - // Imm8 to be shifted left by how many bytes... 
- Imm64 = Imm8 << 8*(cmode >> 1 & 3); - } - break; - } - case ESize64: { - for (unsigned i = 0; i < 8; ++i) - if ((Imm8 >> i) & 1) - Imm64 |= (uint64_t)0xFF << 8*i; - break; - } - default: - assert(0 && "Unreachable code!"); - return 0; - } - - return Imm64; + return (op << 12) | (cmode << 8) | Imm8; } // A8.6.339 VMUL, VMULL (by scalar) @@ -2303,7 +2273,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && - OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? @@ -2320,7 +2290,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, "Reg operand expected"); RegClass = OpInfo[OpIdx].RegClass; - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, RegClass, Rd, UseDRegPair(Opcode)))); @@ -2329,7 +2299,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } // Handle possible lane index. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); ++OpIdx; @@ -2340,7 +2310,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, // possible TIED_TO DPR/QPR's (ignored), then possible lane index. RegClass = OpInfo[0].RegClass; - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { MI.addOperand(MCOperand::CreateReg( getRegisterEnum(B, RegClass, Rd, UseDRegPair(Opcode)))); @@ -2355,7 +2325,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID && - OpInfo[OpIdx + 1].RegClass == 0 && "Addrmode #6 Operands expected"); + OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected"); MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, Rn))); MI.addOperand(MCOperand::CreateImm(0)); // Alignment ignored? @@ -2366,7 +2336,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, ++OpIdx; } - while (OpIdx < NumOps && OpInfo[OpIdx].RegClass == RegClass) { + while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) { assert(TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1 && "Tied to operand expected"); MI.addOperand(MCOperand::CreateReg(0)); @@ -2374,7 +2344,7 @@ static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn, } // Handle possible lane index. 
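
[Editor's sketch] The decodeN1VImm hunk above stops expanding the NEON modified immediate inside the disassembler and instead re-packs the op, cmode, and imm8 fields of the instruction word into the 13-bit encoded form that is expanded later at print time. A self-contained restatement of that bit manipulation; the field positions match the hunk, while the function name and the test value in main are illustrative only:

#include <cassert>
#include <cstdint>

// Extract op (Inst{5}), cmode (Inst{11-8}) and imm8 (Inst{24}:Inst{18-16}:Inst{3-0})
// from a 32-bit NEON one-register-and-modified-immediate word, then pack them
// as (op << 12) | (cmode << 8) | imm8, the same shape as the rewritten decodeN1VImm.
static uint64_t packNEONModImm(uint32_t insn) {
  unsigned op    = (insn >> 5) & 1;
  unsigned cmode = (insn >> 8) & 0xF;
  unsigned imm8  = ((insn >> 24) & 1) << 7 |
                   ((insn >> 16) & 7) << 4 |
                   (insn & 0xF);
  return (op << 12) | (cmode << 8) | imm8;
}

int main() {
  // Illustrative word with op=1, cmode=0xE and imm8=0xAB scattered into the
  // a:bbb:cccc positions used above (not taken from a real instruction stream).
  uint32_t insn = (1u << 24) | (2u << 16) | (1u << 5) | (0xEu << 8) | 0xBu;
  assert(packNEONModImm(insn) == ((1u << 12) | (0xEu << 8) | 0xABu));
  return 0;
}
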
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn))); ++OpIdx; @@ -2438,7 +2408,7 @@ static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode, assert(NumOps >= 2 && (OpInfo[0].RegClass == ARM::DPRRegClassID || OpInfo[0].RegClass == ARM::QPRRegClassID) && - (OpInfo[1].RegClass == 0) && + (OpInfo[1].RegClass < 0) && "Expect 1 reg operand followed by 1 imm operand"); // Qd/Dd = Inst{22:15-12} => NEON Rd @@ -2552,7 +2522,7 @@ static bool DisassembleNVdVmOptImm(MCInst &MI, unsigned Opc, uint32_t insn, } // Add the imm operand, if required. - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { unsigned imm = 0xFFFFFFFF; @@ -2632,7 +2602,7 @@ static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn, decodeNEONRm(insn)))); ++OpIdx; - assert(OpInfo[OpIdx].RegClass == 0 && "Imm operand expected"); + assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected"); // Add the imm operand. @@ -2762,7 +2732,7 @@ static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn, getRegisterEnum(B, OpInfo[OpIdx].RegClass, m))); ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Add the imm operand. unsigned Imm = 0; @@ -2869,15 +2839,9 @@ static bool DisassembleNVTBLFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return true; } -static bool DisassembleNEONFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO) { - assert(0 && "Unreachable code!"); - return false; -} - // Vector Get Lane (move scalar to ARM core register) Instructions. // VGETLNi32, VGETLNs16, VGETLNs8, VGETLNu16, VGETLNu8: Rt Dn index -static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; @@ -2887,7 +2851,7 @@ static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, assert(TID.getNumDefs() == 1 && NumOps >= 3 && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::DPRRegClassID && - OpInfo[2].RegClass == 0 && + OpInfo[2].RegClass < 0 && "Expect >= 3 operands with one dst operand"); ElemSize esize = @@ -2911,7 +2875,7 @@ static bool DisassembleNEONGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Vector Set Lane (move ARM core register to scalar) Instructions. 
// VSETLNi16, VSETLNi32, VSETLNi8: Dd Dd (TIED_TO) Rt index -static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetInstrDesc &TID = ARMInsts[Opcode]; @@ -2923,7 +2887,7 @@ static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, OpInfo[1].RegClass == ARM::DPRRegClassID && TID.getOperandConstraint(1, TOI::TIED_TO) != -1 && OpInfo[2].RegClass == ARM::GPRRegClassID && - OpInfo[3].RegClass == 0 && + OpInfo[3].RegClass < 0 && "Expect >= 3 operands with one dst operand"); ElemSize esize = @@ -2950,7 +2914,7 @@ static bool DisassembleNEONSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn, // Vector Duplicate Instructions (from ARM core register to all elements). // VDUP8d, VDUP16d, VDUP32d, VDUP8q, VDUP16q, VDUP32q: Qd/Dd Rt -static bool DisassembleNEONDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, +static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) { const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; @@ -3090,13 +3054,6 @@ static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, return false; } -static bool DisassembleThumbMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn, - unsigned short NumOps, unsigned &NumOpsAdded, BO) { - - assert(0 && "Unexpected thumb misc. instruction!"); - return false; -} - /// FuncPtrs - FuncPtrs maps ARMFormat to its corresponding DisassembleFP. /// We divide the disassembly task into different categories, with each one /// corresponding to a specific instruction encoding format. There could be @@ -3128,12 +3085,10 @@ static const DisassembleFP FuncPtrs[] = { &DisassembleVFPLdStMulFrm, &DisassembleVFPMiscFrm, &DisassembleThumbFrm, - &DisassembleNEONFrm, - &DisassembleNEONGetLnFrm, - &DisassembleNEONSetLnFrm, - &DisassembleNEONDupFrm, &DisassembleMiscFrm, - &DisassembleThumbMiscFrm, + &DisassembleNGetLnFrm, + &DisassembleNSetLnFrm, + &DisassembleNDupFrm, // VLD and VST (including one lane) Instructions. &DisassembleNLdSt, @@ -3233,7 +3188,8 @@ bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode, // a pair of TargetOperandInfos with isPredicate() property. if (NumOpsRemaining >= 2 && OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && - OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + OpInfo[Idx].RegClass < 0 && + OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) { // If we are inside an IT block, get the IT condition bits maintained via // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). @@ -3265,7 +3221,8 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode, // a pair of TargetOperandInfos with isPredicate() property. if (NumOpsRemaining >= 2 && OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() && - OpInfo[Idx].RegClass == 0 && OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) + OpInfo[Idx].RegClass < 0 && + OpInfo[Idx+1].RegClass == ARM::CCRRegClassID) { // If we are inside an IT block, get the IT condition bits maintained via // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond(). 
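
[Editor's sketch] A large share of the disassembler changes above replace `OpInfo[...].RegClass == 0` checks with `RegClass < 0` (together with casts such as `(unsigned)OpInfo[OpIdx].RegClass == RegClass`), which suggests the operand table now treats a negative RegClass as "not a register operand" so that class ID 0 remains usable. A small sketch of that convention using a made-up operand-info struct, not LLVM's real TargetOperandInfo:

#include <cassert>

// Hypothetical stand-in for an operand descriptor: RegClass is signed and a
// negative value marks immediate/other non-register operands, so checking
// "RegClass < 0" (instead of "== 0") leaves register class 0 available.
struct OperandInfo {
  int RegClass; // >= 0: register class ID, < 0: not a register operand
  bool isRegOperand() const { return RegClass >= 0; }
};

int main() {
  OperandInfo ImmOp = { -1 };   // immediate operand
  OperandInfo GPROp = { 0 };    // register operand in class 0
  assert(!ImmOp.isRegOperand());
  assert(GPROp.isRegOperand()); // would be misclassified by an "== 0" check
  return 0;
}
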
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h index b1d90df34177..7d21256a14f9 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h @@ -137,25 +137,25 @@ static inline void setSlice(uint32_t &Bits, unsigned From, unsigned To, /// Various utilities for checking the target specific flags. /// A unary data processing instruction doesn't have an Rn operand. -static inline bool isUnaryDP(unsigned TSFlags) { +static inline bool isUnaryDP(uint64_t TSFlags) { return (TSFlags & ARMII::UnaryDP); } /// This four-bit field describes the addressing mode used. /// See also ARMBaseInstrInfo.h. -static inline unsigned getAddrMode(unsigned TSFlags) { +static inline unsigned getAddrMode(uint64_t TSFlags) { return (TSFlags & ARMII::AddrModeMask); } /// {IndexModePre, IndexModePost} /// Only valid for load and store ops. /// See also ARMBaseInstrInfo.h. -static inline unsigned getIndexMode(unsigned TSFlags) { +static inline unsigned getIndexMode(uint64_t TSFlags) { return (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; } /// Pre-/post-indexed operations define an extra $base_wb in the OutOperandList. -static inline bool isPrePostLdSt(unsigned TSFlags) { +static inline bool isPrePostLdSt(uint64_t TSFlags) { return (TSFlags & ARMII::IndexModeMask) != 0; } diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h index 4b2e308248c5..4b7a0bf6fdb9 100644 --- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h +++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h @@ -395,7 +395,7 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID, getT1tRm(insn)))); } else { - assert(OpInfo[OpIdx].RegClass == 0 && + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn) @@ -531,7 +531,7 @@ static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn, if (!OpInfo) return false; assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass == 0 && + (OpInfo[1].RegClass < 0 && !OpInfo[1].isPredicate() && !OpInfo[1].isOptionalDef()) && "Invalid arguments"); @@ -598,7 +598,7 @@ static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode, assert(OpIdx < NumOps && "More operands expected"); - if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() && + if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { MI.addOperand(MCOperand::CreateImm(Imm5 ? 
getT1Imm5(insn) : 0)); @@ -632,7 +632,7 @@ static bool DisassembleThumb1LdStSP(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && - (OpInfo[2].RegClass == 0 && + (OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef()) && "Invalid arguments"); @@ -658,7 +658,7 @@ static bool DisassembleThumb1AddPCi(MCInst &MI, unsigned Opcode, uint32_t insn, if (!OpInfo) return false; assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass == 0 && + (OpInfo[1].RegClass < 0 && !OpInfo[1].isPredicate() && !OpInfo[1].isOptionalDef()) && "Invalid arguments"); @@ -685,7 +685,7 @@ static bool DisassembleThumb1AddSPi(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps >= 3 && OpInfo[0].RegClass == ARM::tGPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && - (OpInfo[2].RegClass == 0 && + (OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() && !OpInfo[2].isOptionalDef()) && "Invalid arguments"); @@ -761,7 +761,7 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, // Predicate operands are handled elsewhere. if (NumOps == 2 && OpInfo[0].isPredicate() && OpInfo[1].isPredicate() && - OpInfo[0].RegClass == 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { + OpInfo[0].RegClass < 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) { return true; } @@ -808,7 +808,7 @@ static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn, } assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID && - (OpInfo[1].RegClass==0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) + (OpInfo[1].RegClass < 0 || OpInfo[1].RegClass==ARM::tGPRRegClassID) && "Expect >=2 operands"); // Add the destination operand. @@ -913,7 +913,7 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; if (!OpInfo) return false; - assert(NumOps == 3 && OpInfo[0].RegClass == 0 && + assert(NumOps == 3 && OpInfo[0].RegClass < 0 && OpInfo[1].isPredicate() && OpInfo[2].RegClass == ARM::CCRRegClassID && "Exactly 3 operands expected"); @@ -939,7 +939,7 @@ static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn, const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo; if (!OpInfo) return false; - assert(NumOps == 1 && OpInfo[0].RegClass == 0 && "1 imm operand expected"); + assert(NumOps == 1 && OpInfo[0].RegClass < 0 && "1 imm operand expected"); unsigned Imm11 = getT1Imm11(insn); @@ -1239,7 +1239,7 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode, && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID && OpInfo[2].RegClass == ARM::GPRRegClassID - && OpInfo[3].RegClass == 0 + && OpInfo[3].RegClass < 0 && "Expect >= 4 operands and first 3 as reg operands"); // Add the operands. @@ -1322,8 +1322,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, assert(NumOps == 4 && OpInfo[0].RegClass == ARM::GPRRegClassID && OpInfo[1].RegClass == ARM::GPRRegClassID - && OpInfo[2].RegClass == 0 - && OpInfo[3].RegClass == 0 + && OpInfo[2].RegClass < 0 + && OpInfo[3].RegClass < 0 && "Exactlt 4 operands expect and first two as reg operands"); // Only need to populate the src reg operand. 
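
[Editor's sketch] The ARMDisassemblerCore.h hunk above widens the TSFlags parameter of the helpers (isUnaryDP, getAddrMode, getIndexMode, isPrePostLdSt) from unsigned to uint64_t, so flag fields stored above bit 31 are not silently truncated. A minimal illustration of the mask-and-shift pattern those helpers use; the mask, shift, and bit positions below are invented for the example and are not the real ARMII values:

#include <cassert>
#include <cstdint>

// Invented layout for the example: a 4-bit "index mode" field at bits 35-32.
static const uint64_t IndexModeShift = 32;
static const uint64_t IndexModeMask  = 0xFULL << IndexModeShift;

// Same shape as getIndexMode() in the hunk: mask the field, then shift it down.
static unsigned getIndexMode(uint64_t TSFlags) {
  return (TSFlags & IndexModeMask) >> IndexModeShift;
}

int main() {
  uint64_t Flags = 0x3ULL << IndexModeShift; // field value 3, above bit 31
  assert(getIndexMode(Flags) == 3);
  // If TSFlags were narrowed to a 32-bit unsigned first, the field reads as 0.
  assert(getIndexMode(static_cast<unsigned>(Flags)) == 0);
  return 0;
}
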
MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, @@ -1375,7 +1375,7 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn, if (NumOps == OpIdx) return true; - if (OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { if (Thumb2ShiftOpcode(Opcode)) @@ -1440,7 +1440,7 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode, } // The modified immediate operand should come next. - assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1555,7 +1555,7 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode, ++OpIdx; } - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1772,7 +1772,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID, decodeRm(insn)))); } else { - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); int Offset = 0; @@ -1792,7 +1792,7 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn, } ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 && + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Fills in the shift amount for t2PLDs, t2PLDWs, t2PLIs. MI.addOperand(MCOperand::CreateImm(slice(insn, 5, 4))); @@ -1818,7 +1818,7 @@ static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode, assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::GPRRegClassID && - OpInfo[1].RegClass == 0 && + OpInfo[1].RegClass < 0 && "Expect >= 2 operands, first as reg, and second as imm operand"); // Build the register operand, followed by the (+/-)imm12 immediate. @@ -1930,7 +1930,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode, ++OpIdx; } - assert(OpInfo[OpIdx].RegClass == 0 && !OpInfo[OpIdx].isPredicate() + assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef() && "Pure imm operand expected"); @@ -1981,7 +1981,7 @@ static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn, decodeRm(insn)))); ++OpIdx; - if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == 0 + if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) { // Add the rotation amount immediate. MI.addOperand(MCOperand::CreateImm(decodeRotate(insn))); diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp index 0a4400cc7ddd..bbdd3c7f7c3e 100644 --- a/lib/Target/ARM/NEONMoveFix.cpp +++ b/lib/Target/ARM/NEONMoveFix.cpp @@ -105,8 +105,8 @@ bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) { unsigned MOReg = MO.getReg(); Defs[MOReg] = MI; - // Catch subregs as well. - for (const unsigned *R = TRI->getSubRegisters(MOReg); *R; ++R) + // Catch aliases as well. 
+ for (const unsigned *R = TRI->getAliasSet(MOReg); *R; ++R) Defs[*R] = MI; } } diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index a7258985e104..f67717cdd56f 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -407,7 +407,7 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, "expected a virtual register"); // Extracting from a Q or QQ register. MachineInstr *DefMI = MRI->getVRegDef(VirtReg); - if (!DefMI || !DefMI->isExtractSubreg()) + if (!DefMI || !DefMI->isCopy() || !DefMI->getOperand(1).getSubReg()) return false; VirtReg = DefMI->getOperand(1).getReg(); if (LastSrcReg && LastSrcReg != VirtReg) @@ -418,7 +418,7 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, RC != ARM::QQPRRegisterClass && RC != ARM::QQQQPRRegisterClass) return false; - unsigned SubIdx = DefMI->getOperand(2).getImm(); + unsigned SubIdx = DefMI->getOperand(1).getSubReg(); if (LastSubIdx) { if (LastSubIdx != SubIdx-Stride) return false; @@ -434,22 +434,21 @@ NEONPreAllocPass::FormsRegSequence(MachineInstr *MI, // FIXME: Update the uses of EXTRACT_SUBREG from REG_SEQUENCE is // currently required for correctness. e.g. - // %reg1041; = REG_SEQUENCE %reg1040, 5, %reg1035, 6 + // %reg1041 = REG_SEQUENCE %reg1040, 5, %reg1035, 6 // %reg1042 = EXTRACT_SUBREG %reg1041, 6 // %reg1043 = EXTRACT_SUBREG %reg1041, 5 // VST1q16 %reg1025, 0, %reg1043, %reg1042, - // reg1025 and reg1043 should be replaced with reg1041:6 and reg1041:5 + // reg1042 and reg1043 should be replaced with reg1041:6 and reg1041:5 // respectively. // We need to change how we model uses of REG_SEQUENCE. for (unsigned R = 0; R < NumRegs; ++R) { MachineOperand &MO = MI->getOperand(FirstOpnd + R); unsigned OldReg = MO.getReg(); MachineInstr *DefMI = MRI->getVRegDef(OldReg); - assert(DefMI->isExtractSubreg()); + assert(DefMI->isCopy()); MO.setReg(LastSrcReg); MO.setSubReg(SubIds[R]); - if (R != 0) - MO.setIsKill(false); + MO.setIsKill(false); // Delete the EXTRACT_SUBREG if its result is now dead. if (MRI->use_empty(OldReg)) DefMI->eraseFromParent(); @@ -467,43 +466,9 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) { unsigned FirstOpnd, NumRegs, Offset, Stride; if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride)) continue; - if (llvm::ModelWithRegSequence() && - FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) + if (FormsRegSequence(MI, FirstOpnd, NumRegs, Offset, Stride)) continue; - - MachineBasicBlock::iterator NextI = llvm::next(MBBI); - for (unsigned R = 0; R < NumRegs; ++R) { - MachineOperand &MO = MI->getOperand(FirstOpnd + R); - assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand"); - unsigned VirtReg = MO.getReg(); - assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && - "expected a virtual register"); - - // For now, just assign a fixed set of adjacent registers. - // This leaves plenty of room for future improvements. - static const unsigned NEONDRegs[] = { - ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 - }; - MO.setReg(NEONDRegs[Offset + R * Stride]); - - if (MO.isUse()) { - // Insert a copy from VirtReg. - TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg, - ARM::DPRRegisterClass, ARM::DPRRegisterClass, - DebugLoc()); - if (MO.isKill()) { - MachineInstr *CopyMI = prior(MBBI); - CopyMI->findRegisterUseOperand(VirtReg)->setIsKill(); - } - MO.setIsKill(); - } else if (MO.isDef() && !MO.isDead()) { - // Add a copy to VirtReg. 
- TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(), - ARM::DPRRegisterClass, ARM::DPRRegisterClass, - DebugLoc()); - } - } + llvm_unreachable("expected a REG_SEQUENCE"); } return Modified; diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index fae84d4ee022..af630ac797c5 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -33,64 +33,24 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; - } - } - - return false; -} - -bool Thumb1InstrInfo:: -canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl &Ops) const { - if (Ops.size() != 1) return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVgpr2gpr: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - return false; - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && - !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
- return false; - } - return true; - } - } - - return false; +void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + bool tDest = ARM::tGPRRegClass.contains(DestReg); + bool tSrc = ARM::tGPRRegClass.contains(SrcReg); + unsigned Opc = ARM::tMOVgpr2gpr; + if (tDest && tSrc) + Opc = ARM::tMOVr; + else if (tSrc) + Opc = ARM::tMOVtgpr2gpr; + else if (tDest) + Opc = ARM::tMOVgpr2tgpr; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + assert(ARM::GPRRegClass.contains(DestReg, SrcReg) && + "Thumb1 can only copy GPR registers"); } void Thumb1InstrInfo:: @@ -175,10 +135,10 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, isKill = false; } - if (isKill) { + if (isKill) MBB.addLiveIn(Reg); - MIB.addReg(Reg, RegState::Kill); - } + + MIB.addReg(Reg, getKillRegState(isKill)); } return true; } @@ -221,46 +181,3 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } - -MachineInstr *Thumb1InstrInfo:: -foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, int FI) const { - if (Ops.size() != 1) return NULL; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - MachineInstr *NewMI = NULL; - switch (Opc) { - default: break; - case ARM::tMOVr: - case ARM::tMOVtgpr2gpr: - case ARM::tMOVgpr2tgpr: - case ARM::tMOVgpr2gpr: { - if (OpNum == 0) { // move -> store - unsigned SrcReg = MI->getOperand(1).getReg(); - bool isKill = MI->getOperand(1).isKill(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && - !isARMLowRegister(SrcReg)) - // tSpill cannot take a high register operand. - break; - NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI).addImm(0)); - } else { // move -> load - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && - !isARMLowRegister(DstReg)) - // tRestore cannot target a high register operand. 
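
[Editor's sketch] The new Thumb1InstrInfo::copyPhysReg above picks one of four move opcodes based on whether the destination and source are low (tGPR) registers. Restated as a tiny standalone selector so the four cases are easy to see; the booleans stand in for the ARM::tGPRRegClass.contains() checks and the returned names are just the opcode identifiers from the hunk:

#include <cassert>
#include <cstring>

// Mirrors the opcode choice in copyPhysReg: default is the GPR-to-GPR move,
// overridden when one or both operands are Thumb low registers.
static const char *selectThumb1CopyOpcode(bool tDest, bool tSrc) {
  const char *Opc = "tMOVgpr2gpr";
  if (tDest && tSrc)
    Opc = "tMOVr";
  else if (tSrc)
    Opc = "tMOVtgpr2gpr";
  else if (tDest)
    Opc = "tMOVgpr2tgpr";
  return Opc;
}

int main() {
  assert(std::strcmp(selectThumb1CopyOpcode(true,  true),  "tMOVr") == 0);
  assert(std::strcmp(selectThumb1CopyOpcode(false, true),  "tMOVtgpr2gpr") == 0);
  assert(std::strcmp(selectThumb1CopyOpcode(true,  false), "tMOVgpr2tgpr") == 0);
  assert(std::strcmp(selectThumb1CopyOpcode(false, false), "tMOVgpr2gpr") == 0);
  return 0;
}
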
- break; - bool isDead = MI->getOperand(0).isDead(); - NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore)) - .addReg(DstReg, - RegState::Define | getDeadRegState(isDead)) - .addFrameIndex(FI).addImm(0)); - } - break; - } - } - - return NewMI; -} diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index c937296bbee1..555135a8b76c 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -46,12 +46,10 @@ public: const std::vector &CSI, const TargetRegisterInfo *TRI) const; - bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -64,20 +62,6 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl &Ops) const; - - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl &Ops, - int FrameIndex) const; - - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl &Ops, - MachineInstr* LoadMI) const { - return 0; - } }; } diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 2f635fe50ad5..39b70b43b23f 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -68,21 +68,6 @@ void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB, .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg); } -const TargetRegisterClass* -Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const { - if (isARMLowRegister(Reg)) - return ARM::tGPRRegisterClass; - switch (Reg) { - default: - break; - case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11: - case ARM::R12: case ARM::SP: case ARM::LR: case ARM::PC: - return ARM::GPRRegisterClass; - } - - return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT); -} - bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { const MachineFrameInfo *FFI = MF.getFrameInfo(); unsigned CFSize = FFI->getMaxCallFrameSize(); @@ -410,6 +395,8 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, // before that instead and adjust the UseMI. bool done = false; for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) { + if (II->isDebugValue()) + continue; // If this instruction affects R12, adjust our restore point. for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) { const MachineOperand &MO = II->getOperand(i); diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 4eca3673391f..9a0308afa20c 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -38,9 +38,6 @@ public: unsigned PredReg = 0) const; /// Code Generation virtual methods... - const TargetRegisterClass * - getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const; - bool hasReservedCallFrame(MachineFunction &MF) const; void eliminateCallFramePseudoInstr(MachineFunction &MF, @@ -51,7 +48,8 @@ public: // could not be handled directly in MI. 
int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int Offset, - unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const; + unsigned MOVOpc, unsigned ADDriOpc, + unsigned SUBriOpc) const; bool saveScavengerRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.cpp b/lib/Target/ARM/Thumb2HazardRecognizer.cpp new file mode 100644 index 000000000000..172908da228a --- /dev/null +++ b/lib/Target/ARM/Thumb2HazardRecognizer.cpp @@ -0,0 +1,53 @@ +//===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "Thumb2HazardRecognizer.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/ScheduleDAG.h" +using namespace llvm; + +ScheduleHazardRecognizer::HazardType +Thumb2HazardRecognizer::getHazardType(SUnit *SU) { + if (ITBlockSize) { + MachineInstr *MI = SU->getInstr(); + if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1]) + return Hazard; + } + + return PostRAHazardRecognizer::getHazardType(SU); +} + +void Thumb2HazardRecognizer::Reset() { + ITBlockSize = 0; + PostRAHazardRecognizer::Reset(); +} + +void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr *MI = SU->getInstr(); + unsigned Opcode = MI->getOpcode(); + if (ITBlockSize) { + --ITBlockSize; + } else if (Opcode == ARM::t2IT) { + unsigned Mask = MI->getOperand(1).getImm(); + unsigned NumTZ = CountTrailingZeros_32(Mask); + assert(NumTZ <= 3 && "Invalid IT mask!"); + ITBlockSize = 4 - NumTZ; + MachineBasicBlock::iterator I = MI; + for (unsigned i = 0; i < ITBlockSize; ++i) { + // Advance to the next instruction, skipping any dbg_value instructions. + do { + ++I; + } while (I->isDebugValue()); + ITBlockMIs[ITBlockSize-1-i] = &*I; + } + } + + PostRAHazardRecognizer::EmitInstruction(SU); +} diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.h b/lib/Target/ARM/Thumb2HazardRecognizer.h new file mode 100644 index 000000000000..472665862e41 --- /dev/null +++ b/lib/Target/ARM/Thumb2HazardRecognizer.h @@ -0,0 +1,40 @@ +//===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling Thumb2 functions on +// ARM processors. +// +//===----------------------------------------------------------------------===// + +#ifndef THUMB2HAZARDRECOGNIZER_H +#define THUMB2HAZARDRECOGNIZER_H + +#include "llvm/CodeGen/PostRAHazardRecognizer.h" + +namespace llvm { + +class MachineInstr; + +class Thumb2HazardRecognizer : public PostRAHazardRecognizer { + unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled. 
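
[Editor's sketch] In the new Thumb2HazardRecognizer.cpp above, EmitInstruction derives the IT block length from the t2IT mask operand as 4 minus the number of trailing zeros in the low four bits. A standalone restatement of that computation, using a portable trailing-zero count instead of LLVM's CountTrailingZeros_32; the example mask values assume the usual IT encoding in which the lowest set bit terminates the block:

#include <cassert>

// Count trailing zeros of the low 4 bits of an IT mask (mask is never 0 here).
static unsigned countTrailingZeros4(unsigned Mask) {
  unsigned N = 0;
  while (N < 4 && ((Mask >> N) & 1) == 0)
    ++N;
  return N;
}

// Same arithmetic as EmitInstruction: block size = 4 - trailing zeros.
static unsigned itBlockSize(unsigned Mask) {
  unsigned NumTZ = countTrailingZeros4(Mask & 0xF);
  assert(NumTZ <= 3 && "Invalid IT mask!");
  return 4 - NumTZ;
}

int main() {
  assert(itBlockSize(0x8) == 1); // IT    : 1 predicated instruction
  assert(itBlockSize(0x4) == 2); // ITx   : 2 predicated instructions
  assert(itBlockSize(0x2) == 3); // ITxy  : 3 predicated instructions
  assert(itBlockSize(0x1) == 4); // ITxyz : 4 predicated instructions
  return 0;
}
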
+ MachineInstr *ITBlockMIs[4]; + +public: + Thumb2HazardRecognizer(const InstrItineraryData &ItinData) : + PostRAHazardRecognizer(ItinData) {} + + virtual HazardType getHazardType(SUnit *SU); + virtual void Reset(); + virtual void EmitInstruction(SUnit *SU); +}; + + +} // end namespace llvm + +#endif // THUMB2HAZARDRECOGNIZER_H diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index f36d4ef7567e..cd15bbed9f23 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -14,17 +14,23 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" using namespace llvm; -STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumITs, "Number of IT blocks inserted"); +STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); namespace { - struct Thumb2ITBlockPass : public MachineFunctionPass { + class Thumb2ITBlockPass : public MachineFunctionPass { + bool PreRegAlloc; + + public: static char ID; Thumb2ITBlockPass() : MachineFunctionPass(&ID) {} const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; ARMFunctionInfo *AFI; virtual bool runOnMachineFunction(MachineFunction &Fn); @@ -34,61 +40,167 @@ namespace { } private: - bool InsertITBlocks(MachineBasicBlock &MBB); + bool MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet &Defs, + SmallSet &Uses); + bool InsertITInstructions(MachineBasicBlock &MBB); }; char Thumb2ITBlockPass::ID = 0; } -static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){ - unsigned Opc = MI->getOpcode(); - if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) - return ARMCC::AL; - return llvm::getInstrPredicate(MI, PredReg); +/// TrackDefUses - Tracking what registers are being defined and used by +/// instructions in the IT block. This also tracks "dependencies", i.e. uses +/// in the IT block that are defined before the IT instruction. +static void TrackDefUses(MachineInstr *MI, + SmallSet &Defs, + SmallSet &Uses, + const TargetRegisterInfo *TRI) { + SmallVector LocalDefs; + SmallVector LocalUses; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP) + continue; + if (MO.isUse()) + LocalUses.push_back(Reg); + else + LocalDefs.push_back(Reg); + } + + for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { + unsigned Reg = LocalUses[i]; + Uses.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Uses.insert(*Subreg); + } + + for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { + unsigned Reg = LocalDefs[i]; + Defs.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Defs.insert(*Subreg); + if (Reg == ARM::CPSR) + continue; + } +} + +bool +Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, + ARMCC::CondCodes CC, ARMCC::CondCodes OCC, + SmallSet &Defs, + SmallSet &Uses) { + unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; + if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { + assert(SrcSubIdx == 0 && DstSubIdx == 0 && + "Sub-register indices still around?"); + // llvm models select's as two-address instructions. That means a copy + // is inserted before a t2MOVccr, etc. 
If the copy is scheduled in + // between selects we would end up creating multiple IT blocks. + + // First check if it's safe to move it. + if (Uses.count(DstReg) || Defs.count(SrcReg)) + return false; + + // Then peek at the next instruction to see if it's predicated on CC or OCC. + // If not, then there is nothing to be gained by moving the copy. + MachineBasicBlock::iterator I = MI; ++I; + MachineBasicBlock::iterator E = MI->getParent()->end(); + while (I != E && I->isDebugValue()) + ++I; + if (I != E) { + unsigned NPredReg = 0; + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); + if (NCC == CC || NCC == OCC) + return true; + } + } + return false; } -bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) { +bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { bool Modified = false; + SmallSet Defs; + SmallSet Uses; MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineInstr *MI = &*MBBI; DebugLoc dl = MI->getDebugLoc(); unsigned PredReg = 0; - ARMCC::CondCodes CC = getPredicate(MI, PredReg); - + ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); if (CC == ARMCC::AL) { ++MBBI; continue; } + Defs.clear(); + Uses.clear(); + TrackDefUses(MI, Defs, Uses, TRI); + // Insert an IT instruction. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT)) .addImm(CC); + + // Add implicit use of ITSTATE to IT block instructions. + MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + + MachineInstr *LastITMI = MI; + MachineBasicBlock::iterator InsertPos = MIB; ++MBBI; - // Finalize IT mask. + // Form IT block. ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); unsigned Mask = 0, Pos = 3; // Branches, including tricky ones like LDM_RET, need to end an IT // block so check the instruction we just put in the block. - while (MBBI != E && Pos && - (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) { + for (; MBBI != E && Pos && + (!MI->getDesc().isBranch() && !MI->getDesc().isReturn()) ; ++MBBI) { + if (MBBI->isDebugValue()) + continue; + MachineInstr *NMI = &*MBBI; MI = NMI; - DebugLoc ndl = NMI->getDebugLoc(); + unsigned NPredReg = 0; - ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg); - if (NCC == CC || NCC == OCC) + ARMCC::CondCodes NCC = llvm::getITInstrPredicate(NMI, NPredReg); + if (NCC == CC || NCC == OCC) { Mask |= (NCC & 1) << Pos; - else + // Add implicit use of ITSTATE. + NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/, + true/*isImp*/, false/*isKill*/)); + LastITMI = NMI; + } else { + if (NCC == ARMCC::AL && + MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) { + --MBBI; + MBB.remove(NMI); + MBB.insert(InsertPos, NMI); + ++NumMovedInsts; + continue; + } break; + } + TrackDefUses(NMI, Defs, Uses, TRI); --Pos; - ++MBBI; } + + // Finalize IT mask. Mask |= (1 << Pos); // Tag along (firstcond[0] << 4) with the mask. Mask |= (CC & 1) << 4; MIB.addImm(Mask); + + // Last instruction in IT block kills ITSTATE. 
+ LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); + Modified = true; ++NumITs; } @@ -100,17 +212,21 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) { const TargetMachine &TM = Fn.getTarget(); AFI = Fn.getInfo(); TII = static_cast(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); if (!AFI->isThumbFunction()) return false; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) { + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) { MachineBasicBlock &MBB = *MFI; - Modified |= InsertITBlocks(MBB); + ++MFI; + Modified |= InsertITInstructions(MBB); } + if (Modified) + AFI->setHasITBlocks(true); + return Modified; } diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 531d5e9f147e..ee517279c9d7 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -17,15 +17,27 @@ #include "ARMAddressingModes.h" #include "ARMGenInstrInfo.inc" #include "ARMMachineFunctionInfo.h" +#include "Thumb2HazardRecognizer.h" +#include "Thumb2InstrInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/ADT/SmallVector.h" -#include "Thumb2InstrInfo.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; +static cl::opt +IfCvtLimit("thumb2-ifcvt-limit", cl::Hidden, + cl::desc("Thumb2 if-conversion limit (default 3)"), + cl::init(3)); + +static cl::opt +IfCvtDiamondLimit("thumb2-ifcvt-diamond-limit", cl::Hidden, + cl::desc("Thumb2 diamond if-conversion limit (default 3)"), + cl::init(3)); + Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI), RI(*this, STI) { } @@ -35,33 +47,99 @@ unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const { return 0; } -bool -Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const { - if (DestRC == ARM::GPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg); - return true; - } - } else if (DestRC == ARM::tGPRRegisterClass) { - if (SrcRC == ARM::GPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg); - return true; - } else if (SrcRC == ARM::tGPRRegisterClass) { - BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg); - return true; +void +Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + ARMFunctionInfo *AFI = MBB->getParent()->getInfo(); + if (!AFI->hasITBlocks()) { + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + return; + } + + // If the first instruction of Tail is predicated, we may have to update + // the IT instruction. + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(Tail, PredReg); + MachineBasicBlock::iterator MBBI = Tail; + if (CC != ARMCC::AL) + // Expecting at least the t2IT instruction before it. + --MBBI; + + // Actually replace the tail. + TargetInstrInfoImpl::ReplaceTailWithBranchTo(Tail, NewDest); + + // Fix up IT. 
+ if (CC != ARMCC::AL) { + MachineBasicBlock::iterator E = MBB->begin(); + unsigned Count = 4; // At most 4 instructions in an IT block. + while (Count && MBBI != E) { + if (MBBI->isDebugValue()) { + --MBBI; + continue; + } + if (MBBI->getOpcode() == ARM::t2IT) { + unsigned Mask = MBBI->getOperand(1).getImm(); + if (Count == 4) + MBBI->eraseFromParent(); + else { + unsigned MaskOn = 1 << Count; + unsigned MaskOff = ~(MaskOn - 1); + MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn); + } + return; + } + --MBBI; + --Count; } + + // Ctrl flow can reach here if branch folding is run before IT block + // formation pass. } +} + +bool +Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + unsigned PredReg = 0; + return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; +} +bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs) const { + return NumInstrs && NumInstrs <= IfCvtLimit; +} + +bool Thumb2InstrInfo:: +isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, + MachineBasicBlock &FMBB, unsigned NumF) const { + // FIXME: Catch optimization such as: + // r0 = movne + // r0 = moveq + return NumT && NumF && + NumT <= (IfCvtDiamondLimit) && NumF <= (IfCvtDiamondLimit); +} + +void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { // Handle SPR, DPR, and QPR copies. - return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC, DL); + if (!ARM::GPRRegClass.contains(DestReg, SrcReg)) + return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc); + + bool tDest = ARM::tGPRRegClass.contains(DestReg); + bool tSrc = ARM::tGPRRegClass.contains(SrcReg); + unsigned Opc = ARM::tMOVgpr2gpr; + if (tDest && tSrc) + Opc = ARM::tMOVr; + else if (tSrc) + Opc = ARM::tMOVtgpr2gpr; + else if (tDest) + Opc = ARM::tMOVgpr2tgpr; + + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } void Thumb2InstrInfo:: @@ -69,7 +147,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -94,7 +173,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass) { + if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass || + RC == ARM::tcGPRRegisterClass) { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -113,6 +193,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI); } +ScheduleHazardRecognizer *Thumb2InstrInfo:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const { + return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II); +} + void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, unsigned DestReg, unsigned BaseReg, int NumBytes, @@ -131,14 +216,14 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, // Use a movw to 
materialize the 16-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg) .addImm(NumBytes) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg); Fits = true; } else if ((NumBytes & 0xffff) == 0) { // Use a movt to materialize the 32-bit constant. BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg) .addReg(DestReg) .addImm(NumBytes >> 16) - .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + .addImm((unsigned)Pred).addReg(PredReg); Fits = true; } @@ -502,3 +587,54 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, Offset = (isSub) ? -Offset : Offset; return Offset == 0; } + +/// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the +/// two-addrss instruction inserted by two-address pass. +void +Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI, + MachineInstr *UseMI, + const TargetRegisterInfo &TRI) const { + if (SrcMI->getOpcode() != ARM::tMOVgpr2gpr || + SrcMI->getOperand(1).isKill()) + return; + + unsigned PredReg = 0; + ARMCC::CondCodes CC = llvm::getInstrPredicate(UseMI, PredReg); + if (CC == ARMCC::AL || PredReg != ARM::CPSR) + return; + + // Schedule the copy so it doesn't come between previous instructions + // and UseMI which can form an IT block. + unsigned SrcReg = SrcMI->getOperand(1).getReg(); + ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC); + MachineBasicBlock *MBB = UseMI->getParent(); + MachineBasicBlock::iterator MBBI = SrcMI; + unsigned NumInsts = 0; + while (--MBBI != MBB->begin()) { + if (MBBI->isDebugValue()) + continue; + + MachineInstr *NMI = &*MBBI; + ARMCC::CondCodes NCC = llvm::getInstrPredicate(NMI, PredReg); + if (!(NCC == CC || NCC == OCC) || + NMI->modifiesRegister(SrcReg, &TRI) || + NMI->definesRegister(ARM::CPSR)) + break; + if (++NumInsts == 4) + // Too many in a row! + return; + } + + if (NumInsts) { + MBB->remove(SrcMI); + MBB->insert(++MBBI, SrcMI); + } +} + +ARMCC::CondCodes +llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { + unsigned Opc = MI->getOpcode(); + if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) + return ARMCC::AL; + return llvm::getInstrPredicate(MI, PredReg); +} diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 29487700d19b..3a9f8b194d3c 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -20,7 +20,8 @@ #include "Thumb2RegisterInfo.h" namespace llvm { - class ARMSubtarget; +class ARMSubtarget; +class ScheduleHazardRecognizer; class Thumb2InstrInfo : public ARMBaseInstrInfo { Thumb2RegisterInfo RI; @@ -31,12 +32,21 @@ public: // if there is not such an opcode. 
unsigned getUnindexedOpcode(unsigned Opc) const; - bool copyRegToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SrcReg, - const TargetRegisterClass *DestRC, - const TargetRegisterClass *SrcRC, - DebugLoc DL) const; + void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const; + + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs) const; + + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs, + MachineBasicBlock &FMBB, unsigned NumFInstrs) const; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -50,12 +60,27 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const; + /// scheduleTwoAddrSource - Schedule the copy / re-mat of the source of the + /// two-addrss instruction inserted by two-address pass. + void scheduleTwoAddrSource(MachineInstr *SrcMI, MachineInstr *UseMI, + const TargetRegisterInfo &TRI) const; + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// const Thumb2RegisterInfo &getRegisterInfo() const { return RI; } + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const; }; + +/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical +/// to llvm::getInstrPredicate except it returns AL for conditional branch +/// instructions which are "predicated", but are not in IT blocks. +ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg); + + } #endif // THUMB2INSTRUCTIONINFO_H diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 8fe2e42a7cdd..ba392f36d946 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -451,11 +451,18 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; - const TargetInstrDesc &TID = MI->getDesc(); unsigned Reg0 = MI->getOperand(0).getReg(); unsigned Reg1 = MI->getOperand(1).getReg(); - if (Reg0 != Reg1) - return false; + if (Reg0 != Reg1) { + // Try to commute the operands to make it a 2-address instruction. + unsigned CommOpIdx1, CommOpIdx2; + if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || + CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) + return false; + MachineInstr *CommutedMI = TII->commuteInstruction(MI); + if (!CommutedMI) + return false; + } if (Entry.LowRegs2 && !isARMLowRegister(Reg0)) return false; if (Entry.Imm2Limit) { @@ -484,6 +491,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, bool HasCC = false; bool CCDead = false; + const TargetInstrDesc &TID = MI->getDesc(); if (TID.hasOptionalDef()) { unsigned NumOps = TID.getNumOperands(); HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR); @@ -689,7 +697,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { goto ProcessNext; } - // Try to transform ro a 16-bit non-two-address instruction. + // Try to transform to a 16-bit non-two-address instruction. 
if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) { Modified = true; MachineBasicBlock::iterator I = prior(NextMII); -- cgit v1.2.3
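The IT-block machinery added above (Thumb2ITBlockPass and Thumb2HazardRecognizer) revolves around the 5-bit mask operand of t2IT: bit 4 tags along firstcond[0], bits 3..0 carry one then/else bit per additional instruction followed by a terminating 1, so the block length falls out of the trailing-zero count. A minimal standalone sketch of that encoding, in plain C++ without the LLVM data structures (all names below are illustrative, not LLVM's):

// itmask_sketch.cpp - illustrative only; mirrors the mask layout built by
// Thumb2ITBlockPass and decoded by Thumb2HazardRecognizer.
#include <cassert>
#include <cstdio>
#include <vector>

// Encode the mask for an IT block. firstCondLSB is the low bit of the first
// condition code; isThen[i] says whether instruction i+2 uses the same
// condition ("then") or its opposite ("else").
static unsigned encodeITMask(unsigned firstCondLSB,
                             const std::vector<bool> &isThen) {
  assert(isThen.size() <= 3 && "an IT block covers at most 4 instructions");
  unsigned Mask = 0, Pos = 3;
  for (bool T : isThen) {
    unsigned CondLSB = T ? firstCondLSB : (firstCondLSB ^ 1u);
    Mask |= CondLSB << Pos;          // same as "(NCC & 1) << Pos" in the pass
    --Pos;
  }
  Mask |= 1u << Pos;                 // stop bit that terminates the block
  Mask |= (firstCondLSB & 1u) << 4;  // "(CC & 1) << 4" in the pass
  return Mask;
}

// Recover the block length the way the hazard recognizer does: 4 minus the
// number of trailing zeros of the mask.
static unsigned itBlockSize(unsigned Mask) {
  unsigned NumTZ = 0;
  while (NumTZ < 4 && !((Mask >> NumTZ) & 1u))
    ++NumTZ;
  assert(NumTZ <= 3 && "invalid (empty) IT mask");
  return 4 - NumTZ;
}

int main() {
  // "ITTE EQ": first instruction EQ, then EQ, then NE -> 3 instructions.
  unsigned Mask = encodeITMask(/*firstCondLSB=*/0, {true, false});
  std::printf("mask = 0x%02x, block size = %u\n", Mask, itBlockSize(Mask));
  return 0;
}

Running it prints mask 0x06 and size 3 for an ITTE EQ block, which matches what EmitInstruction recovers via CountTrailingZeros_32.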
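ReplaceTailWithBranchTo above has to repair the t2IT mask when it cuts instructions off the end of an IT block: keep the surviving condition bits, plant a new stop bit, or erase the IT entirely when nothing survives. That arithmetic is isolated below as a hedged sketch (standalone C++; the function name and the Remaining parameter are invented for illustration):

#include <cassert>
#include <cstdio>

// Remaining is how many predicated instructions still follow the t2IT;
// a return value of 0 corresponds to erasing the t2IT instruction.
static unsigned shrinkITMask(unsigned Mask, unsigned Remaining) {
  assert(Remaining <= 4 && "IT blocks never exceed 4 instructions");
  if (Remaining == 0)
    return 0;                        // nothing left: drop the t2IT
  unsigned Count = 4 - Remaining;    // the backwards-walk counter in the patch
  unsigned MaskOn = 1u << Count;     // new stop-bit position
  unsigned MaskOff = ~(MaskOn - 1u); // clears the bits of the removed tail
  return (Mask & MaskOff) | MaskOn;
}

int main() {
  // "ITTE EQ" (mask 0x06, 3 instructions) loses its last instruction and
  // becomes "ITT EQ" (mask 0x04, 2 instructions).
  std::printf("0x%02x -> 0x%02x\n", 0x06u, shrinkITMask(0x06, 2));
  return 0;
}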
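MoveCopyOutOfITBlock hoists an unpredicated copy above the IT instruction only when that is safe with respect to the instructions already committed to the block: none of them may read the copy's destination (hoisting would clobber a value they still need) and none may write the copy's source (the hoisted copy would capture a stale value). A sketch of just that check, with plain std::set standing in for the pass's SmallSet and TrackDefUses bookkeeping:

#include <cstdio>
#include <set>

// Defs/Uses are the registers written/read so far by the IT-block
// instructions the copy would be hoisted above.
static bool safeToHoistCopy(unsigned DstReg, unsigned SrcReg,
                            const std::set<unsigned> &Defs,
                            const std::set<unsigned> &Uses) {
  return !Uses.count(DstReg) && !Defs.count(SrcReg);
}

int main() {
  std::set<unsigned> Defs = {0};     // block writes r0
  std::set<unsigned> Uses = {1, 2};  // block reads r1, r2
  std::printf("hoist r3 = r4: %s\n", safeToHoistCopy(3, 4, Defs, Uses) ? "yes" : "no");
  std::printf("hoist r1 = r4: %s\n", safeToHoistCopy(1, 4, Defs, Uses) ? "yes" : "no"); // r1 is read
  std::printf("hoist r3 = r0: %s\n", safeToHoistCopy(3, 0, Defs, Uses) ? "yes" : "no"); // r0 is written
  return 0;
}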
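The new Thumb2InstrInfo::copyPhysReg picks among four move opcodes purely from whether the source and destination are low registers (tGPR, r0-r7). The sketch below restates that choice as a standalone function; the strings stand in for the ARM::tMOV* opcodes and the register-class queries are reduced to booleans, so this is only an illustration of the selection logic:

#include <cstdio>

static const char *pickGPRCopyOpcode(bool DestIsLow, bool SrcIsLow) {
  if (DestIsLow && SrcIsLow) return "tMOVr";        // low  -> low
  if (SrcIsLow)              return "tMOVtgpr2gpr"; // low  -> high
  if (DestIsLow)             return "tMOVgpr2tgpr"; // high -> low
  return "tMOVgpr2gpr";                             // high -> high
}

int main() {
  std::printf("r1  <- r2:  %s\n", pickGPRCopyOpcode(true, true));
  std::printf("r9  <- r3:  %s\n", pickGPRCopyOpcode(false, true));
  std::printf("r4  <- r10: %s\n", pickGPRCopyOpcode(true, false));
  std::printf("r8  <- r12: %s\n", pickGPRCopyOpcode(false, false));
  return 0;
}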
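The ReduceTo2Addr change in Thumb2SizeReduction.cpp narrows more instructions by commuting: a 32-bit "rd = rn OP rm" can only shrink to a 16-bit two-address encoding when rd == rn, so when rd == rm and OP commutes, the sources are swapped first. A simplified sketch of that decision, using hypothetical standalone types rather than LLVM's MachineInstr/findCommutedOpIndices:

#include <cstdio>
#include <utility>

struct ThreeRegInstr {
  unsigned Rd, Rn, Rm;   // rd = rn OP rm
  bool Commutative;      // e.g. an add/and/or-style opcode
};

// Returns true if the instruction now has the rd == rn shape required by
// the 16-bit two-address encodings, commuting the sources if that helps.
static bool makeTwoAddress(ThreeRegInstr &MI) {
  if (MI.Rd == MI.Rn)
    return true;                 // already two-address shaped
  if (!MI.Commutative || MI.Rd != MI.Rm)
    return false;                // commuting would not help
  std::swap(MI.Rn, MI.Rm);       // rd = rm OP rn  ==>  now rd == rn
  return true;
}

int main() {
  ThreeRegInstr Add = {/*Rd=*/1, /*Rn=*/2, /*Rm=*/1, /*Commutative=*/true};
  if (makeTwoAddress(Add))
    std::printf("adds r%u, r%u   ; 16-bit form after commuting\n",
                Add.Rd, Add.Rm);
  return 0;
}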