From 66e41e3c6e8b8fbc48d5d3b4d2bd9ce0be4ecb75 Mon Sep 17 00:00:00 2001
From: Roman Divacky
Date: Tue, 13 Jul 2010 17:19:57 +0000
Subject: Update LLVM to r108243.

---
 lib/CodeGen/AggressiveAntiDepBreaker.cpp | 83 +-
 lib/CodeGen/AggressiveAntiDepBreaker.h | 1 +
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 94 +-
 lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 4 +-
 lib/CodeGen/AsmPrinter/DIE.cpp | 2 +
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 246 ++-
 lib/CodeGen/AsmPrinter/DwarfDebug.h | 18 +-
 lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 6 +-
 lib/CodeGen/BranchFolding.cpp | 68 +-
 lib/CodeGen/BranchFolding.h | 5 +-
 lib/CodeGen/CMakeLists.txt | 7 +-
 lib/CodeGen/CalcSpillWeights.cpp | 2 +-
 lib/CodeGen/CallingConvLower.cpp | 177 ++
 lib/CodeGen/CodePlacementOpt.cpp | 4 +
 lib/CodeGen/CriticalAntiDepBreaker.cpp | 158 +-
 lib/CodeGen/CriticalAntiDepBreaker.h | 5 +-
 lib/CodeGen/DwarfEHPrepare.cpp | 110 +-
 lib/CodeGen/ELFCodeEmitter.cpp | 2 +-
 lib/CodeGen/ExactHazardRecognizer.cpp | 180 ---
 lib/CodeGen/ExactHazardRecognizer.h | 86 -
 lib/CodeGen/GCStrategy.cpp | 6 +-
 lib/CodeGen/IfConversion.cpp | 400 +++--
 lib/CodeGen/InlineSpiller.cpp | 408 +++++
 lib/CodeGen/IntrinsicLowering.cpp | 51 +-
 lib/CodeGen/LLVMTargetMachine.cpp | 15 +-
 lib/CodeGen/LatencyPriorityQueue.cpp | 2 +-
 lib/CodeGen/LiveInterval.cpp | 66 +-
 lib/CodeGen/LiveIntervalAnalysis.cpp | 236 +--
 lib/CodeGen/LiveStackAnalysis.cpp | 4 +-
 lib/CodeGen/LiveVariables.cpp | 9 +-
 lib/CodeGen/LowerSubregs.cpp | 217 +--
 lib/CodeGen/MachineBasicBlock.cpp | 129 +-
 lib/CodeGen/MachineCSE.cpp | 46 +-
 lib/CodeGen/MachineDominators.cpp | 1 -
 lib/CodeGen/MachineFunction.cpp | 16 +-
 lib/CodeGen/MachineInstr.cpp | 113 +-
 lib/CodeGen/MachineLICM.cpp | 118 +-
 lib/CodeGen/MachineRegisterInfo.cpp | 130 +-
 lib/CodeGen/MachineSink.cpp | 102 +-
 lib/CodeGen/MachineVerifier.cpp | 3 +-
 lib/CodeGen/OptimizeExts.cpp | 24 +-
 lib/CodeGen/OptimizePHIs.cpp | 5 +
 lib/CodeGen/PBQP/HeuristicSolver.h | 2 +-
 lib/CodeGen/PBQP/Heuristics/Briggs.h | 5 +-
 lib/CodeGen/PHIElimination.cpp | 63 +-
 lib/CodeGen/Passes.cpp | 26 +-
 lib/CodeGen/PostRAHazardRecognizer.cpp | 180 +++
 lib/CodeGen/PostRASchedulerList.cpp | 51 +-
 lib/CodeGen/PreAllocSplitting.cpp | 89 +-
 lib/CodeGen/ProcessImplicitDefs.cpp | 33 +-
 lib/CodeGen/PrologEpilogInserter.cpp | 65 +-
 lib/CodeGen/RegAllocFast.cpp | 226 ++-
 lib/CodeGen/RegAllocLinearScan.cpp | 33 +-
 lib/CodeGen/RegAllocLocal.cpp | 1254 ---------------
 lib/CodeGen/RegAllocPBQP.cpp | 25 +-
 lib/CodeGen/RegisterCoalescer.cpp | 156 ++
 lib/CodeGen/RegisterScavenging.cpp | 33 +-
 lib/CodeGen/ScheduleDAG.cpp | 37 +-
 lib/CodeGen/ScheduleDAGEmit.cpp | 14 +-
 lib/CodeGen/ScheduleDAGInstrs.h | 5 +-
 lib/CodeGen/SelectionDAG/CMakeLists.txt | 1 -
 lib/CodeGen/SelectionDAG/CallingConvLower.cpp | 179 ---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 267 +--
 lib/CodeGen/SelectionDAG/FastISel.cpp | 354 ++--
 lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 63 +-
 lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h | 144 --
 lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 133 +-
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 335 +++-
 lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 40 +-
 lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 150 +-
 lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 72 +-
 lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 +
 lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 8 +-
 lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 2 +-
 lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 140 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp |
17 +- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 241 +-- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 5 +- lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 158 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 1248 +++++++------- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 12 + lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 421 ++--- lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp | 2 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 218 ++- lib/CodeGen/ShadowStackGC.cpp | 14 +- lib/CodeGen/SimpleHazardRecognizer.h | 89 - lib/CodeGen/SimpleRegisterCoalescing.cpp | 1790 +++++---------------- lib/CodeGen/SimpleRegisterCoalescing.h | 84 +- lib/CodeGen/SjLjEHPrepare.cpp | 114 +- lib/CodeGen/SlotIndexes.cpp | 4 +- lib/CodeGen/Spiller.cpp | 209 ++- lib/CodeGen/Spiller.h | 18 +- lib/CodeGen/StackProtector.cpp | 16 +- lib/CodeGen/StackSlotColoring.cpp | 14 +- lib/CodeGen/StrongPHIElimination.cpp | 22 +- lib/CodeGen/TailDuplication.cpp | 18 +- lib/CodeGen/TargetInstrInfoImpl.cpp | 182 ++- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 112 +- lib/CodeGen/TwoAddressInstructionPass.cpp | 303 +++- lib/CodeGen/VirtRegRewriter.cpp | 110 +- 101 files changed, 6179 insertions(+), 6792 deletions(-) create mode 100644 lib/CodeGen/CallingConvLower.cpp delete mode 100644 lib/CodeGen/ExactHazardRecognizer.cpp delete mode 100644 lib/CodeGen/ExactHazardRecognizer.h create mode 100644 lib/CodeGen/InlineSpiller.cpp create mode 100644 lib/CodeGen/PostRAHazardRecognizer.cpp delete mode 100644 lib/CodeGen/RegAllocLocal.cpp delete mode 100644 lib/CodeGen/SelectionDAG/CallingConvLower.cpp delete mode 100644 lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h delete mode 100644 lib/CodeGen/SimpleHazardRecognizer.h (limited to 'lib/CodeGen') diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 4008a6a63cf8..a7189acc3fec 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -114,6 +115,7 @@ AggressiveAntiDepBreaker(MachineFunction& MFi, TargetSubtarget::RegClassVector& CriticalPathRCs) : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), + TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), AllocatableSet(TRI->getAllocatableSet(MF)), State(NULL) { @@ -163,25 +165,27 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { DefIndices[AliasReg] = ~0u; } } - } else { - // In a non-return block, examine the live-in regs of all successors. - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + } + + // In a non-return block, examine the live-in regs of all successors. + // Note a return block can have successors if the return instruction is + // predicated. + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), + for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - State->UnionGroups(Reg, 0); - KillIndices[Reg] = BB->size(); - DefIndices[Reg] = ~0u; - // Repeat, for all aliases. 
- for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - State->UnionGroups(AliasReg, 0); - KillIndices[AliasReg] = BB->size(); - DefIndices[AliasReg] = ~0u; - } + unsigned Reg = *I; + State->UnionGroups(Reg, 0); + KillIndices[Reg] = BB->size(); + DefIndices[Reg] = ~0u; + // Repeat, for all aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + unsigned AliasReg = *Alias; + State->UnionGroups(AliasReg, 0); + KillIndices[AliasReg] = BB->size(); + DefIndices[AliasReg] = ~0u; } - } + } // Mark live-out callee-saved registers. In a return block this is // all callee-saved registers. In non-return this is any @@ -390,7 +394,8 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. Also assume all registers // defined in a call must not be changed (ABI). - if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq()) { + if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq() || + TII->isPredicated(MI)) { DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)"); State->UnionGroups(Reg, 0); } @@ -443,6 +448,26 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, std::multimap& RegRefs = State->GetRegRefs(); + // If MI's uses have special allocation requirement, don't allow + // any use registers to be changed. Also assume all registers + // used in a call must not be changed (ABI). + // FIXME: The issue with predicated instruction is more complex. We are being + // conservatively here because the kill markers cannot be trusted after + // if-conversion: + // %R6 = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14] + // ... + // STR %R0, %R6, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395] + // %R6 = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12] + // STR %R0, %R6, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8) + // + // The first R6 kill is not really a kill since it's killed by a predicated + // instruction which may not be executed. The second R6 def may or may not + // re-define R6 so it's not safe to change it since the last R6 use cannot be + // changed. + bool Special = MI->getDesc().isCall() || + MI->getDesc().hasExtraSrcRegAllocReq() || + TII->isPredicated(MI); + // Scan the register uses for this instruction and update // live-ranges, groups and RegRefs. for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -459,10 +484,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, // for the register. HandleLastUse(Reg, Count, "(last-use)"); - // If MI's uses have special allocation requirement, don't allow - // any use registers to be changed. Also assume all registers - // used in a call must not be changed (ABI). - if (MI->getDesc().isCall() || MI->getDesc().hasExtraSrcRegAllocReq()) { + if (Special) { DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)"); State->UnionGroups(Reg, 0); } @@ -604,8 +626,12 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( // order. If that register is available, and the corresponding // registers are available for the other group subregisters, then we // can use those registers to rename. + + // FIXME: Using getMinimalPhysRegClass is very conservative. We should + // check every use of the register and find the largest register class + // that can be used in all of them. 
const TargetRegisterClass *SuperRC = - TRI->getPhysicalRegisterRegClass(SuperReg, MVT::Other); + TRI->getMinimalPhysRegClass(SuperReg, MVT::Other); const TargetRegisterClass::iterator RB = SuperRC->allocation_order_begin(MF); const TargetRegisterClass::iterator RE = SuperRC->allocation_order_end(MF); @@ -905,6 +931,19 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first, QE = Range.second; Q != QE; ++Q) { Q->second.Operand->setReg(NewReg); + // If the SU for the instruction being updated has debug + // information related to the anti-dependency register, make + // sure to update that as well. + const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()]; + if (!SU) continue; + for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) { + MachineInstr *DI = SU->DbgInstrList[i]; + assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() && + DI->getOperand(0).getReg() + && "Non register dbg_value attached to SUnit!"); + if (DI->getOperand(0).getReg() == AntiDepReg) + DI->getOperand(0).setReg(NewReg); + } } // We just went back in time and modified history; the diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h index 506d43e7f3fc..91ebb850d19d 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.h +++ b/lib/CodeGen/AggressiveAntiDepBreaker.h @@ -115,6 +115,7 @@ namespace llvm { class AggressiveAntiDepBreaker : public AntiDepBreaker { MachineFunction& MF; MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; /// AllocatableSet - The set of allocatable registers. diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 5a0c27b300ab..d9387a8e72c5 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -199,7 +199,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const { case GlobalValue::LinkOnceODRLinkage: case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: - case GlobalValue::LinkerPrivateLinkage: + case GlobalValue::LinkerPrivateWeakLinkage: if (MAI->getWeakDefDirective() != 0) { // .globl _foo OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global); @@ -225,6 +225,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const { break; case GlobalValue::PrivateLinkage: case GlobalValue::InternalLinkage: + case GlobalValue::LinkerPrivateLinkage: break; default: llvm_unreachable("Unknown linkage type!"); @@ -330,7 +331,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { else if (GVKind.isThreadData()) { OutStreamer.SwitchSection(TheSection); - EmitLinkage(GV->getLinkage(), MangSym); EmitAlignment(AlignLog, GV); OutStreamer.EmitLabel(MangSym); @@ -353,7 +353,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // - spare pointer, used when mapped by the runtime // - pointer to mangled symbol above with initializer unsigned PtrSize = TD->getPointerSizeInBits()/8; - OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("__tlv_bootstrap"), + OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"), PtrSize, 0); OutStreamer.EmitIntValue(0, PtrSize, 0); OutStreamer.EmitSymbolValue(MangSym, PtrSize, 0); @@ -428,20 +428,12 @@ void AsmPrinter::EmitFunctionHeader() { // Emit pre-function debug and/or EH information. 
if (DE) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(EHTimerName, DWARFGroupName); - DE->BeginFunction(MF); - } else { - DE->BeginFunction(MF); - } + NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled); + DE->BeginFunction(MF); } if (DD) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->beginFunction(MF); - } else { - DD->beginFunction(MF); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->beginFunction(MF); } } @@ -458,14 +450,11 @@ void AsmPrinter::EmitFunctionEntryLabel() { } -/// EmitComments - Pretty-print comments for instructions. -static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { - const MachineFunction *MF = MI.getParent()->getParent(); - const TargetMachine &TM = MF->getTarget(); - - DebugLoc DL = MI.getDebugLoc(); +static void EmitDebugLoc(DebugLoc DL, const MachineFunction *MF, + raw_ostream &CommentOS) { + const LLVMContext &Ctx = MF->getFunction()->getContext(); if (!DL.isUnknown()) { // Print source line info. - DIScope Scope(DL.getScope(MF->getFunction()->getContext())); + DIScope Scope(DL.getScope(Ctx)); // Omit the directory, because it's likely to be long and uninteresting. if (Scope.Verify()) CommentOS << Scope.getFilename(); @@ -474,6 +463,23 @@ static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { CommentOS << ':' << DL.getLine(); if (DL.getCol() != 0) CommentOS << ':' << DL.getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + CommentOS << "[ "; + EmitDebugLoc(InlinedAtDL, MF, CommentOS); + CommentOS << " ]"; + } + } +} + +/// EmitComments - Pretty-print comments for instructions. +static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) { + const MachineFunction *MF = MI.getParent()->getParent(); + const TargetMachine &TM = MF->getTarget(); + + DebugLoc DL = MI.getDebugLoc(); + if (!DL.isUnknown()) { // Print source line info. + EmitDebugLoc(DL, MF, CommentOS); CommentOS << '\n'; } @@ -611,12 +617,8 @@ void AsmPrinter::EmitFunctionBody() { } if (ShouldPrintDebugScopes) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->beginScope(II); - } else { - DD->beginScope(II); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->beginScope(II); } if (isVerbose()) @@ -649,12 +651,8 @@ void AsmPrinter::EmitFunctionBody() { } if (ShouldPrintDebugScopes) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->endScope(II); - } else { - DD->endScope(II); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->endScope(II); } } } @@ -692,20 +690,12 @@ void AsmPrinter::EmitFunctionBody() { // Emit post-function debug information. if (DD) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->endFunction(MF); - } else { - DD->endFunction(MF); - } + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + DD->endFunction(MF); } if (DE) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(EHTimerName, DWARFGroupName); - DE->EndFunction(); - } else { - DE->EndFunction(); - } + NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled); + DE->EndFunction(); } MMI->EndFunction(); @@ -730,19 +720,15 @@ bool AsmPrinter::doFinalization(Module &M) { // Finalize debug and EH information. 
if (DE) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(EHTimerName, DWARFGroupName); - DE->EndModule(); - } else { + { + NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled); DE->EndModule(); } delete DE; DE = 0; } if (DD) { - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - DD->endModule(); - } else { + { + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); DD->endModule(); } delete DD; DD = 0; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index ba6fed2a78ba..f6f3bae42a80 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -83,7 +83,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, unsigned LocCookie) const { // Tell SrcMgr about this buffer, it takes ownership of the buffer. SrcMgr.AddNewSourceBuffer(Buffer, SMLoc()); - AsmParser Parser(SrcMgr, OutContext, OutStreamer, *MAI); + AsmParser Parser(TM.getTarget(), SrcMgr, OutContext, OutStreamer, *MAI); OwningPtr TAP(TM.getTarget().createAsmParser(Parser)); if (!TAP) report_fatal_error("Inline asm not supported by this streamer because" @@ -279,7 +279,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { // Okay, we finally have a value number. Ask the target to print this // operand! if (CurVariant == -1 || CurVariant == AsmPrinterVariant) { - unsigned OpNo = 1; + unsigned OpNo = 2; bool Error = false; diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index b2c70d51f5a5..21396ca37f06 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -201,6 +201,7 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const { case dwarf::DW_FORM_data8: Size = 8; break; case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return; + case dwarf::DW_FORM_addr: Size = Asm->getTargetData().getPointerSize(); break; default: llvm_unreachable("DIE Value form not supported yet"); } Asm->OutStreamer.EmitIntValue(Integer, Size, 0/*addrspace*/); @@ -221,6 +222,7 @@ unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const { case dwarf::DW_FORM_data8: return sizeof(int64_t); case dwarf::DW_FORM_udata: return MCAsmInfo::getULEB128Size(Integer); case dwarf::DW_FORM_sdata: return MCAsmInfo::getSLEB128Size(Integer); + case dwarf::DW_FORM_addr: return AP->getTargetData().getPointerSize(); default: llvm_unreachable("DIE Value form not supported yet"); break; } return 0; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 890507cf3148..65c1d190216f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -44,7 +44,8 @@ using namespace llvm; static cl::opt PrintDbgScope("print-dbgscope", cl::Hidden, cl::desc("Print DbgScope information for each machine instruction")); -static cl::opt DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, +static cl::opt DisableDebugInfoPrinting("disable-debug-info-print", + cl::Hidden, cl::desc("Disable debug info printing")); static cl::opt UnknownLocations("use-unknown-locations", cl::Hidden, @@ -79,15 +80,13 @@ class CompileUnit { /// IndexTyDie - An anonymous type for index type. Owned by CUDie. DIE *IndexTyDie; - /// GVToDieMap - Tracks the mapping of unit level debug informaton + /// MDNodeToDieMap - Tracks the mapping of unit level debug informaton /// variables to debug information entries. 
- /// FIXME : Rename GVToDieMap -> NodeToDieMap - DenseMap GVToDieMap; + DenseMap MDNodeToDieMap; - /// GVToDIEEntryMap - Tracks the mapping of unit level debug informaton + /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug informaton /// descriptors to debug information entries using a DIEEntry proxy. - /// FIXME : Rename - DenseMap GVToDIEEntryMap; + DenseMap MDNodeToDIEEntryMap; /// Globals - A map of globally visible named entities for this unit. /// @@ -123,25 +122,25 @@ public: /// getDIE - Returns the debug information entry map slot for the /// specified debug variable. - DIE *getDIE(const MDNode *N) { return GVToDieMap.lookup(N); } + DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); } /// insertDIE - Insert DIE into the map. void insertDIE(const MDNode *N, DIE *D) { - GVToDieMap.insert(std::make_pair(N, D)); + MDNodeToDieMap.insert(std::make_pair(N, D)); } /// getDIEEntry - Returns the debug information entry for the speciefied /// debug variable. DIEEntry *getDIEEntry(const MDNode *N) { - DenseMap::iterator I = GVToDIEEntryMap.find(N); - if (I == GVToDIEEntryMap.end()) + DenseMap::iterator I = MDNodeToDIEEntryMap.find(N); + if (I == MDNodeToDIEEntryMap.end()) return NULL; return I->second; } /// insertDIEEntry - Insert debug information entry into the map. void insertDIEEntry(const MDNode *N, DIEEntry *E) { - GVToDIEEntryMap.insert(std::make_pair(N, E)); + MDNodeToDIEEntryMap.insert(std::make_pair(N, E)); } /// addDie - Adds or interns the DIE to the compile unit. @@ -321,12 +320,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) DwarfFrameSectionSym = DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0; DwarfStrSectionSym = TextSectionSym = 0; DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0; + DwarfDebugLineSectionSym = CurrentLineSectionSym = 0; FunctionBeginSym = FunctionEndSym = 0; - if (TimePassesIsEnabled) { - NamedRegionTimer T(DbgTimerName, DWARFGroupName); - beginModule(M); - } else { - beginModule(M); + DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); + { + NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); + beginModule(M); } } DwarfDebug::~DwarfDebug() { @@ -378,7 +377,8 @@ DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) { void DwarfDebug::addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); - DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer); + DIEValue *Value = Integer == 1 ? 
+ DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer); Die->addValue(Attribute, Form, Value); } @@ -866,6 +866,10 @@ void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) { } else if (Context.isNameSpace()) { DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context)); ContextDIE->addChild(Die); + } else if (Context.isSubprogram()) { + DIE *ContextDIE = createSubprogramDIE(DISubprogram(Context), + /*MakeDecl=*/false); + ContextDIE->addChild(Die); } else if (DIE *ContextDIE = getCompileUnit(Context)->getDIE(Context)) ContextDIE->addChild(Die); else @@ -1055,6 +1059,10 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (DIDescriptor(ContainingType).isCompositeType()) addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, getOrCreateTypeDIE(DIType(ContainingType))); + else { + DIDescriptor Context = CTy.getContext(); + addToContextOwner(&Buffer, Context); + } break; } default: @@ -1065,8 +1073,9 @@ void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (!Name.empty()) addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name); - if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type || - Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) { + if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type + || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) + { // Add size if non-zero (derived types might be zero-sized.) if (Size) addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size); @@ -1329,6 +1338,9 @@ DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) { // DW_TAG_inlined_subroutine may refer to this DIE. SPCU->insertDIE(SP, SPDie); + // Add to context owner. + addToContextOwner(SPDie, SP.getContext()); + return SPDie; } @@ -1379,6 +1391,7 @@ static bool isSubprogramContext(const MDNode *Context) { DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { CompileUnit *SPCU = getCompileUnit(SPNode); DIE *SPDie = SPCU->getDIE(SPNode); + assert(SPDie && "Unable to find subprogram DIE!"); DISubprogram SP(SPNode); @@ -1412,6 +1425,14 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) { SPCU->addDie(SPDie); } + // Pick up abstract subprogram DIE. 
+ if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) { + SPDie = new DIE(dwarf::DW_TAG_subprogram); + addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, + dwarf::DW_FORM_ref4, AbsSPDIE); + SPCU->addDie(SPDie); + } + addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber())); addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, @@ -1483,7 +1504,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) { const MCSymbol *StartLabel = getLabelBeforeInsn(RI->first); const MCSymbol *EndLabel = getLabelAfterInsn(RI->second); - if (StartLabel == FunctionBeginSym || EndLabel == 0) { + if (StartLabel == 0 || EndLabel == 0) { assert (0 && "Unexpected Start and End labels for a inlined scope!"); return 0; } @@ -1605,11 +1626,13 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) { // FIXME : Handle getNumOperands != 3 if (DVInsn->getNumOperands() == 3) { if (DVInsn->getOperand(0).isReg()) - updated = addRegisterAddress(VariableDie, DVLabel, DVInsn->getOperand(0)); + updated = + addRegisterAddress(VariableDie, DVLabel, DVInsn->getOperand(0)); else if (DVInsn->getOperand(0).isImm()) updated = addConstantValue(VariableDie, DVLabel, DVInsn->getOperand(0)); else if (DVInsn->getOperand(0).isFPImm()) - updated = addConstantFPValue(VariableDie, DVLabel, DVInsn->getOperand(0)); + updated = + addConstantFPValue(VariableDie, DVLabel, DVInsn->getOperand(0)); } else { MachineLocation Location = Asm->getDebugValueLocation(DVInsn); if (Location.getReg()) { @@ -1682,8 +1705,13 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) { if (Scope->getInlinedAt()) ScopeDIE = constructInlinedScopeDIE(Scope); else if (DS.isSubprogram()) { - if (Scope->isAbstractScope()) + ProcessedSPNodes.insert(DS); + if (Scope->isAbstractScope()) { ScopeDIE = getCompileUnit(DS)->getDIE(DS); + // Note down abstract DIE. + if (ScopeDIE) + AbstractSPDies.insert(std::make_pair(DS, ScopeDIE)); + } else ScopeDIE = updateSubprogramScopeDIE(DS); } @@ -1782,11 +1810,11 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) { addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN); // Use DW_AT_entry_pc instead of DW_AT_low_pc/DW_AT_high_pc pair. This // simplifies debug range entries. - addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_data4, 0); + addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0); // DW_AT_stmt_list is a offset of line number information for this - // compile unit in debug_line section. It is always zero when only one - // compile unit is emitted in one object file. - addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); + // compile unit in debug_line section. This offset is calculated + // during endMoudle(). + addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); if (!Dir.empty()) addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir); @@ -1996,6 +2024,40 @@ void DwarfDebug::beginModule(Module *M) { /// void DwarfDebug::endModule() { if (!FirstCU) return; + const Module *M = MMI->getModule(); + if (NamedMDNode *AllSPs = M->getNamedMetadata("llvm.dbg.sp")) { + for (unsigned SI = 0, SE = AllSPs->getNumOperands(); SI != SE; ++SI) { + if (ProcessedSPNodes.count(AllSPs->getOperand(SI)) != 0) continue; + DISubprogram SP(AllSPs->getOperand(SI)); + if (!SP.Verify()) continue; + + // Collect info for variables that were optimized out. 
+ StringRef FName = SP.getLinkageName(); + if (FName.empty()) + FName = SP.getName(); + NamedMDNode *NMD = + M->getNamedMetadata(Twine("llvm.dbg.lv.", getRealLinkageName(FName))); + if (!NMD) continue; + unsigned E = NMD->getNumOperands(); + if (!E) continue; + DbgScope *Scope = new DbgScope(NULL, DIDescriptor(SP), NULL); + for (unsigned I = 0; I != E; ++I) { + DIVariable DV(NMD->getOperand(I)); + if (!DV.Verify()) continue; + Scope->addVariable(new DbgVariable(DV)); + } + + // Construct subprogram DIE and add variables DIEs. + constructSubprogramDIE(SP); + DIE *ScopeDIE = getCompileUnit(SP)->getDIE(SP); + const SmallVector &Variables = Scope->getVariables(); + for (unsigned i = 0, N = Variables.size(); i < N; ++i) { + DIE *VariableDIE = constructVariableDIE(Variables[i], Scope); + if (VariableDIE) + ScopeDIE->addChild(VariableDIE); + } + } + } // Attach DW_AT_inline attribute with inlined subprogram DIEs. for (SmallPtrSet::iterator AI = InlinedSubprogramDIEs.begin(), @@ -2037,15 +2099,15 @@ void DwarfDebug::endModule() { // Compute DIE offsets and sizes. computeSizeAndOffsets(); + // Emit source line correspondence into a debug line section. + emitDebugLines(); + // Emit all the DIEs into a debug info section emitDebugInfo(); // Corresponding abbreviations into a abbrev section. emitAbbreviations(); - // Emit source line correspondence into a debug line section. - emitDebugLines(); - // Emit info into a debug pubnames section. emitDebugPubNames(); @@ -2150,8 +2212,9 @@ static bool isDbgValueInDefinedReg(const MachineInstr *MI) { } /// collectVariableInfo - Populate DbgScope entries with variables' info. -void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { - SmallPtrSet Processed; +void +DwarfDebug::collectVariableInfo(const MachineFunction *MF, + SmallPtrSet &Processed) { /// collection info from MMI table. collectVariableInfoFromMMITable(MF, Processed); @@ -2180,16 +2243,23 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { if (Processed.count(DV) != 0) continue; + const MachineInstr *PrevMI = MInsn; for (SmallVector::iterator MI = I+1, ME = DbgValues.end(); MI != ME; ++MI) { const MDNode *Var = (*MI)->getOperand((*MI)->getNumOperands()-1).getMetadata(); - if (Var == DV && isDbgValueInDefinedReg(*MI)) + if (Var == DV && isDbgValueInDefinedReg(*MI) && + !PrevMI->isIdenticalTo(*MI)) MultipleValues.push_back(*MI); + PrevMI = *MI; } DbgScope *Scope = findDbgScope(MInsn); - if (!Scope && DV.getTag() == dwarf::DW_TAG_arg_variable) + bool CurFnArg = false; + if (DV.getTag() == dwarf::DW_TAG_arg_variable && + DISubprogram(DV.getContext()).describes(MF->getFunction())) + CurFnArg = true; + if (!Scope && CurFnArg) Scope = CurrentFnDbgScope; // If variable scope is not found then skip this variable. 
if (!Scope) @@ -2198,7 +2268,7 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { Processed.insert(DV); DbgVariable *RegVar = new DbgVariable(DV); Scope->addVariable(RegVar); - if (DV.getTag() != dwarf::DW_TAG_arg_variable) + if (!CurFnArg) DbgVariableLabelsMap[RegVar] = getLabelBeforeInsn(MInsn); if (DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc())) { DbgVariableToDbgInstMap[AbsVar] = MInsn; @@ -2217,7 +2287,8 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { const MachineInstr *Begin = NULL; const MachineInstr *End = NULL; for (SmallVector::iterator - MVI = MultipleValues.begin(), MVE = MultipleValues.end(); MVI != MVE; ++MVI) { + MVI = MultipleValues.begin(), MVE = MultipleValues.end(); + MVI != MVE; ++MVI) { if (!Begin) { Begin = *MVI; continue; @@ -2241,8 +2312,11 @@ void DwarfDebug::collectVariableInfo(const MachineFunction *MF) { } // Collect info for variables that were optimized out. + const Function *F = MF->getFunction(); + const Module *M = F->getParent(); if (NamedMDNode *NMD = - MF->getFunction()->getParent()->getNamedMetadata("llvm.dbg.lv")) { + M->getNamedMetadata(Twine("llvm.dbg.lv.", + getRealLinkageName(F->getName())))) { for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { DIVariable DV(cast_or_null(NMD->getOperand(i))); if (!DV || !Processed.insert(DV)) @@ -2319,7 +2393,8 @@ void DwarfDebug::endScope(const MachineInstr *MI) { } /// getOrCreateDbgScope - Create DbgScope for the scope. -DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *InlinedAt) { +DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, + const MDNode *InlinedAt) { if (!InlinedAt) { DbgScope *WScope = DbgScopeMap.lookup(Scope); if (WScope) @@ -2335,13 +2410,20 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *Inl if (!WScope->getParent()) { StringRef SPName = DISubprogram(Scope).getLinkageName(); - if (SPName == Asm->MF->getFunction()->getName()) + // We used to check only for a linkage name, but that fails + // since we began omitting the linkage name for private + // functions. The new way is to check for the name in metadata, + // but that's not supported in old .ll test cases. Ergo, we + // check both. + if (SPName == Asm->MF->getFunction()->getName() || + DISubprogram(Scope).getFunction() == Asm->MF->getFunction()) CurrentFnDbgScope = WScope; } return WScope; } + getOrCreateAbstractScope(Scope); DbgScope *WScope = DbgScopeMap.lookup(InlinedAt); if (WScope) return WScope; @@ -2355,7 +2437,6 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *Inl Parent->addScope(WScope); ConcreteScopes[InlinedAt] = WScope; - getOrCreateAbstractScope(Scope); return WScope; } @@ -2365,8 +2446,6 @@ DbgScope *DwarfDebug::getOrCreateDbgScope(const MDNode *Scope, const MDNode *Inl static bool hasValidLocation(LLVMContext &Ctx, const MachineInstr *MInsn, const MDNode *&Scope, const MDNode *&InlinedAt) { - if (MInsn->isDebugValue()) - return false; DebugLoc DL = MInsn->getDebugLoc(); if (DL.isUnknown()) return false; @@ -2488,7 +2567,8 @@ bool DwarfDebug::extractScopeInformation() { // current instruction scope does not match scope of first instruction // in this range then create a new instruction range. 
DbgRange R(RangeBeginMI, PrevMI); - MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope, PrevInlinedAt); + MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevScope, + PrevInlinedAt); MIRanges.push_back(R); } @@ -2565,7 +2645,6 @@ void DwarfDebug::identifyScopeMarkers() { RE = Ranges.end(); RI != RE; ++RI) { assert(RI->first && "DbgRange does not have first instruction!"); assert(RI->second && "DbgRange does not have second instruction!"); - InsnsBeginScopeSet.insert(RI->first); InsnsEndScopeSet.insert(RI->second); } } @@ -2616,6 +2695,9 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { recordSourceLine(Line, Col, Scope); + /// ProcessedArgs - Collection of arguments already processed. + SmallPtrSet ProcessedArgs; + DebugLoc PrevLoc; for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; ++I) @@ -2624,14 +2706,19 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { const MachineInstr *MI = II; DebugLoc DL = MI->getDebugLoc(); if (MI->isDebugValue()) { - // DBG_VALUE needs a label if the variable is local variable or - // an argument whose location is changing. assert (MI->getNumOperands() > 1 && "Invalid machine instruction!"); DIVariable DV(MI->getOperand(MI->getNumOperands() - 1).getMetadata()); if (!DV.Verify()) continue; - if (DV.getTag() != dwarf::DW_TAG_arg_variable) + // If DBG_VALUE is for a local variable then it needs a label. + if (DV.getTag() != dwarf::DW_TAG_arg_variable + && isDbgValueInUndefinedReg(MI) == false) InsnNeedsLabel.insert(MI); - else if (!ProcessedArgs.insert(DV)) + // DBG_VALUE for inlined functions argument needs a label. + else if (!DISubprogram(getDISubprogram(DV.getContext())). + describes(MF->getFunction())) + InsnNeedsLabel.insert(MI); + // DBG_VALUE indicating argument location change needs a label. + else if (isDbgValueInUndefinedReg(MI) == false && !ProcessedArgs.insert(DV)) InsnNeedsLabel.insert(MI); } else { // If location is unknown then instruction needs a location only if @@ -2664,7 +2751,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Assumes in correct section after the entry point. Asm->OutStreamer.EmitLabel(FunctionEndSym); - collectVariableInfo(MF); + SmallPtrSet ProcessedVars; + collectVariableInfo(MF, ProcessedVars); // Get function line info. if (!Lines.empty()) { @@ -2679,9 +2767,31 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Construct abstract scopes. for (SmallVector::iterator AI = AbstractScopesList.begin(), - AE = AbstractScopesList.end(); AI != AE; ++AI) - constructScopeDIE(*AI); - + AE = AbstractScopesList.end(); AI != AE; ++AI) { + DISubprogram SP((*AI)->getScopeNode()); + if (SP.Verify()) { + // Collect info for variables that were optimized out. 
+ StringRef FName = SP.getLinkageName(); + if (FName.empty()) + FName = SP.getName(); + const Module *M = MF->getFunction()->getParent(); + if (NamedMDNode *NMD = + M->getNamedMetadata(Twine("llvm.dbg.lv.", + getRealLinkageName(FName)))) { + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { + DIVariable DV(cast_or_null(NMD->getOperand(i))); + if (!DV || !ProcessedVars.insert(DV)) + continue; + DbgScope *Scope = AbstractScopes.lookup(DV.getContext()); + if (Scope) + Scope->addVariable(new DbgVariable(DV)); + } + } + } + if (ProcessedSPNodes.count((*AI)->getScopeNode()) == 0) + constructScopeDIE(*AI); + } + DIE *CurFnDIE = constructScopeDIE(CurrentFnDbgScope); if (!DisableFramePointerElim(*MF)) @@ -2696,13 +2806,11 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Clear debug info CurrentFnDbgScope = NULL; InsnNeedsLabel.clear(); - ProcessedArgs.clear(); DbgVariableToFrameIndexMap.clear(); VarToAbstractVarMap.clear(); DbgVariableToDbgInstMap.clear(); DbgVariableLabelsMap.clear(); DeleteContainerSeconds(DbgScopeMap); - InsnsBeginScopeSet.clear(); InsnsEndScopeSet.clear(); ConcreteScopes.clear(); DeleteContainerSeconds(AbstractScopes); @@ -2764,7 +2872,8 @@ DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) { /// recordSourceLine - Register a source line with debug info. Returns the /// unique label that was emitted and which provides correspondence to /// the source line list. -MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S) { +MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, + const MDNode *S) { StringRef Dir; StringRef Fn; @@ -2790,6 +2899,16 @@ MCSymbol *DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode Src = GetOrCreateSourceID(Dir, Fn); } +#if 0 + if (!Lines.empty()) { + SrcLineInfo lastSrcLineInfo = Lines.back(); + // Emitting sequential line records with the same line number (but + // different addresses) seems to confuse GDB. Avoid this. + if (lastSrcLineInfo.getLine() == Line) + return NULL; + } +#endif + MCSymbol *Label = MMI->getContext().CreateTempSymbol(); Lines.push_back(SrcLineInfo(Line, Col, Src, Label)); @@ -2898,7 +3017,8 @@ void DwarfDebug::EmitSectionLabels() { if (const MCSection *MacroInfo = TLOF.getDwarfMacroInfoSection()) EmitSectionSym(Asm, MacroInfo); - EmitSectionSym(Asm, TLOF.getDwarfLineSection()); + DwarfDebugLineSectionSym = + EmitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line"); EmitSectionSym(Asm, TLOF.getDwarfLocSection()); EmitSectionSym(Asm, TLOF.getDwarfPubNamesSection()); EmitSectionSym(Asm, TLOF.getDwarfPubTypesSection()); @@ -2961,6 +3081,11 @@ void DwarfDebug::emitDIE(DIE *Die) { 4); break; } + case dwarf::DW_AT_stmt_list: { + Asm->EmitLabelDifference(CurrentLineSectionSym, + DwarfDebugLineSectionSym, 4); + break; + } case dwarf::DW_AT_location: { if (UseDotDebugLocEntry.count(Die) != 0) { DIELabel *L = cast(Values[i]); @@ -3106,6 +3231,8 @@ void DwarfDebug::emitDebugLines() { Asm->getObjFileLowering().getDwarfLineSection()); // Construct the section header. 
+ CurrentLineSectionSym = Asm->GetTempSymbol("section_line_begin"); + Asm->OutStreamer.EmitLabel(CurrentLineSectionSym); Asm->OutStreamer.AddComment("Length of Source Line Info"); Asm->EmitLabelDifference(Asm->GetTempSymbol("line_end"), Asm->GetTempSymbol("line_begin"), 4); @@ -3491,8 +3618,9 @@ void DwarfDebug::emitDebugLoc() { unsigned char Size = Asm->getTargetData().getPointerSize(); Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", 0)); unsigned index = 1; - for (SmallVector::iterator I = DotDebugLocEntries.begin(), - E = DotDebugLocEntries.end(); I != E; ++I, ++index) { + for (SmallVector::iterator + I = DotDebugLocEntries.begin(), E = DotDebugLocEntries.end(); + I != E; ++I, ++index) { DotDebugLocEntry Entry = *I; if (Entry.isEmpty()) { Asm->OutStreamer.EmitIntValue(0, Size, /*addrspace*/0); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 0d6116fc9861..5a281c851748 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -156,6 +156,9 @@ class DwarfDebug { /// not included DbgScopeMap. AbstractScopes owns its DbgScope*s. DenseMap AbstractScopes; + /// AbstractSPDies - Collection of abstract subprogram DIEs. + DenseMap AbstractSPDies; + /// AbstractScopesList - Tracks abstract scopes constructed while processing /// a function. This list is cleared during endFunction(). SmallVectorAbstractScopesList; @@ -210,7 +213,7 @@ class DwarfDebug { DenseMap ContainingTypeMap; typedef SmallVector ScopeVector; - SmallPtrSet InsnsBeginScopeSet; + SmallPtrSet InsnsEndScopeSet; /// InlineInfo - Keep track of inlined functions and their location. This @@ -219,6 +222,10 @@ class DwarfDebug { DenseMap > InlineInfo; SmallVector InlinedSPNodes; + // ProcessedSPNodes - This is a collection of subprogram MDNodes that + // are processed to create DIEs. + SmallPtrSet ProcessedSPNodes; + /// LabelsBeforeInsn - Maps instruction with label emitted before /// instruction. DenseMap LabelsBeforeInsn; @@ -231,9 +238,6 @@ class DwarfDebug { /// a debuggging information entity. SmallPtrSet InsnNeedsLabel; - /// ProcessedArgs - Collection of arguments already processed. - SmallPtrSet ProcessedArgs; - SmallVector DebugRangeSymbols; /// Previous instruction's location information. This is used to determine @@ -257,7 +261,10 @@ class DwarfDebug { MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym; MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym; MCSymbol *DwarfDebugLocSectionSym; + MCSymbol *DwarfDebugLineSectionSym, *CurrentLineSectionSym; MCSymbol *FunctionBeginSym, *FunctionEndSym; + + DIEInteger *DIEIntegerOne; private: /// getSourceDirectoryAndFileIds - Return the directory and file ids that @@ -593,7 +600,8 @@ private: bool extractScopeInformation(); /// collectVariableInfo - Populate DbgScope entries with variables' info. - void collectVariableInfo(const MachineFunction *); + void collectVariableInfo(const MachineFunction *, + SmallPtrSet &ProcessedVars); /// collectVariableInfoFromMMITable - Collect variable information from /// side table maintained by MMI. 
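// --------------------------------------------------------------------------
// Illustrative sketch, not part of the upstream patch: many hunks above
// (AsmPrinter.cpp, DwarfDebug.cpp) replace the pattern
//   if (TimePassesIsEnabled) { NamedRegionTimer T(Name, Group); doWork(); }
//   else                     { doWork(); }
// with a single RAII guard, NamedRegionTimer T(Name, Group,
// TimePassesIsEnabled), so the timed work is written only once. The
// standalone C++ below shows the same idea with a hypothetical ScopedTimer
// type; the class name and output format are assumptions, only the pattern
// mirrors the patch.
#include <chrono>
#include <cstdio>

namespace {
// Minimal stand-in for llvm::NamedRegionTimer: it times the enclosing scope
// only when Enabled is true, so callers need no if/else duplication.
class ScopedTimer {
  const char *Name;
  bool Enabled;
  std::chrono::steady_clock::time_point Start;

public:
  ScopedTimer(const char *N, bool E) : Name(N), Enabled(E) {
    if (Enabled)
      Start = std::chrono::steady_clock::now();
  }
  ~ScopedTimer() {
    if (!Enabled)
      return;
    auto End = std::chrono::steady_clock::now();
    auto Us =
        std::chrono::duration_cast<std::chrono::microseconds>(End - Start);
    std::printf("%s: %lld us\n", Name, (long long)Us.count());
  }
};
} // end anonymous namespace

// Stand-in for the timed work (e.g. DD->beginFunction(MF) in the patch).
static void emitDwarfForFunction() {}

int main() {
  bool TimePassesIsEnabled = true; // set by -time-passes in LLVM proper
  {
    ScopedTimer T("dwarf debug", TimePassesIsEnabled);
    emitDwarfForFunction(); // written once, timed only if the flag is set
  }
  return 0;
}
// --------------------------------------------------------------------------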
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index f92127f22748..c8a63cf2393b 100644 --- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -52,13 +52,13 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) { SymName.append(MId.begin(), std::find(MId.begin(), MId.end(), '.')); SymName += "__"; SymName += Id; - + // Capitalize the first letter of the module name. SymName[Letter] = toupper(SymName[Letter]); - + SmallString<128> TmpStr; AP.Mang->getNameWithPrefix(TmpStr, SymName); - + MCSymbol *Sym = AP.OutContext.GetOrCreateSymbol(TmpStr); AP.OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 9dec22ec78a3..7f98df0d22ea 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -358,23 +358,10 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1, } /// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything -/// after it, replacing it with an unconditional branch to NewDest. This -/// returns true if OldInst's block is modified, false if NewDest is modified. +/// after it, replacing it with an unconditional branch to NewDest. void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, MachineBasicBlock *NewDest) { - MachineBasicBlock *OldBB = OldInst->getParent(); - - // Remove all the old successors of OldBB from the CFG. - while (!OldBB->succ_empty()) - OldBB->removeSuccessor(OldBB->succ_begin()); - - // Remove all the dead instructions from the end of OldBB. - OldBB->erase(OldInst, OldBB->end()); - - // If OldBB isn't immediately before OldBB, insert a branch to it. - if (++MachineFunction::iterator(OldBB) != MachineFunction::iterator(NewDest)) - TII->InsertBranch(*OldBB, NewDest, 0, SmallVector()); - OldBB->addSuccessor(NewDest); + TII->ReplaceTailWithBranchTo(OldInst, NewDest); ++NumTailMerge; } @@ -383,6 +370,9 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, /// iterator. This returns the new MBB. MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1) { + if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1)) + return 0; + MachineFunction &MF = *CurMBB.getParent(); // Create the fall-through block. @@ -443,18 +433,20 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, MachineFunction::iterator I = llvm::next(MachineFunction::iterator(CurMBB)); MachineBasicBlock *TBB = 0, *FBB = 0; SmallVector Cond; + DebugLoc dl; // FIXME: this is nowhere if (I != MF->end() && !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) { MachineBasicBlock *NextBB = I; if (TBB == NextBB && !Cond.empty() && !FBB) { if (!TII->ReverseBranchCondition(Cond)) { TII->RemoveBranch(*CurMBB); - TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond); + TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond, dl); return; } } } - TII->InsertBranch(*CurMBB, SuccBB, NULL, SmallVector()); + TII->InsertBranch(*CurMBB, SuccBB, NULL, + SmallVector(), dl); } bool @@ -625,9 +617,10 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash, /// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist /// only of the common tail. Create a block that does by splitting one. 
-unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, - unsigned maxCommonTailLength) { - unsigned commonTailIndex = 0; +bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, + unsigned maxCommonTailLength, + unsigned &commonTailIndex) { + commonTailIndex = 0; unsigned TimeEstimate = ~0U; for (unsigned i = 0, e = SameTails.size(); i != e; ++i) { // Use PredBB if possible; that doesn't require a new branch. @@ -655,6 +648,11 @@ unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, << maxCommonTailLength); MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI); + if (!newMBB) { + DEBUG(dbgs() << "... failed!"); + return false; + } + SameTails[commonTailIndex].setBlock(newMBB); SameTails[commonTailIndex].setTailStartPos(newMBB->begin()); @@ -662,7 +660,7 @@ unsigned BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, if (PredBB == MBB) PredBB = newMBB; - return commonTailIndex; + return true; } // See if any of the blocks in MergePotentials (which all have a common single @@ -757,7 +755,11 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB, !SameTails[commonTailIndex].tailIsWholeBlock())) { // None of the blocks consist entirely of the common tail. // Split a block so that one does. - commonTailIndex = CreateCommonTailOnlyBlock(PredBB, maxCommonTailLength); + if (!CreateCommonTailOnlyBlock(PredBB, + maxCommonTailLength, commonTailIndex)) { + RemoveBlocksWithHash(CurHash, SuccBB, PredBB); + continue; + } } MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock(); @@ -874,10 +876,11 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { } // Remove the unconditional branch at the end, if any. if (TBB && (Cond.empty() || FBB)) { + DebugLoc dl; // FIXME: this is nowhere TII->RemoveBranch(*PBB); if (!Cond.empty()) // reinsert conditional branch only, for now - TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond); + TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); } MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); } @@ -976,6 +979,7 @@ static bool IsBetterFallthrough(MachineBasicBlock *MBB1, bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool MadeChange = false; MachineFunction &MF = *MBB->getParent(); + DebugLoc dl; // FIXME: this is nowhere ReoptimizeBlock: MachineFunction::iterator FallThrough = MBB; @@ -1027,7 +1031,7 @@ ReoptimizeBlock: TII->RemoveBranch(PrevBB); PriorCond.clear(); if (PriorTBB != MBB) - TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond); + TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1066,7 +1070,7 @@ ReoptimizeBlock: // the condition is false, remove the uncond second branch. 
if (PriorFBB == MBB) { TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond); + TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1079,7 +1083,7 @@ ReoptimizeBlock: SmallVector NewPriorCond(PriorCond); if (!TII->ReverseBranchCondition(NewPriorCond)) { TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond); + TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1116,7 +1120,7 @@ ReoptimizeBlock: << "To make fallthrough to: " << *PriorTBB << "\n"); TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond); + TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond, dl); // Move this block to the end of the function. MBB->moveAfter(--MF.end()); @@ -1145,7 +1149,7 @@ ReoptimizeBlock: SmallVector NewCond(CurCond); if (!TII->ReverseBranchCondition(NewCond)) { TII->RemoveBranch(*MBB); - TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond); + TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1200,7 +1204,7 @@ ReoptimizeBlock: PriorFBB = MBB; } TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond); + TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, dl); } // Iterate through all the predecessors, revectoring each in-turn. @@ -1226,7 +1230,7 @@ ReoptimizeBlock: if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) { TII->RemoveBranch(*PMBB); NewCurCond.clear(); - TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond); + TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, dl); MadeChange = true; ++NumBranchOpts; PMBB->CorrectExtraCFGEdges(NewCurTBB, 0, false); @@ -1246,7 +1250,7 @@ ReoptimizeBlock: } // Add the branch back if the block is more than just an uncond branch. 
- TII->InsertBranch(*MBB, CurTBB, 0, CurCond); + TII->InsertBranch(*MBB, CurTBB, 0, CurCond, dl); } } @@ -1286,7 +1290,7 @@ ReoptimizeBlock: if (CurFallsThru) { MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); CurCond.clear(); - TII->InsertBranch(*MBB, NextBB, 0, CurCond); + TII->InsertBranch(*MBB, NextBB, 0, CurCond, dl); } MBB->moveAfter(PredBB); MadeChange = true; diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index b08739564060..15dfa7f6bee5 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -102,8 +102,9 @@ namespace llvm { MachineBasicBlock *PredBB); void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB, MachineBasicBlock* PredBB); - unsigned CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, - unsigned maxCommonTailLength); + bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB, + unsigned maxCommonTailLength, + unsigned &commonTailIndex); bool OptimizeBranches(MachineFunction &MF); bool OptimizeBlock(MachineBasicBlock *MBB); diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 3e38872a36d6..ffeff1ee27a6 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -1,19 +1,20 @@ add_llvm_library(LLVMCodeGen - Analysis.cpp AggressiveAntiDepBreaker.cpp + Analysis.cpp BranchFolding.cpp CalcSpillWeights.cpp + CallingConvLower.cpp CodePlacementOpt.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DwarfEHPrepare.cpp ELFCodeEmitter.cpp ELFWriter.cpp - ExactHazardRecognizer.cpp GCMetadata.cpp GCMetadataPrinter.cpp GCStrategy.cpp IfConversion.cpp + InlineSpiller.cpp IntrinsicLowering.cpp LLVMTargetMachine.cpp LatencyPriorityQueue.cpp @@ -45,6 +46,7 @@ add_llvm_library(LLVMCodeGen OptimizePHIs.cpp PHIElimination.cpp Passes.cpp + PostRAHazardRecognizer.cpp PostRASchedulerList.cpp PreAllocSplitting.cpp ProcessImplicitDefs.cpp @@ -52,7 +54,6 @@ add_llvm_library(LLVMCodeGen PseudoSourceValue.cpp RegAllocFast.cpp RegAllocLinearScan.cpp - RegAllocLocal.cpp RegAllocPBQP.cpp RegisterCoalescer.cpp RegisterScavenging.cpp diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index a328d0e556e9..240a7b94fccf 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -116,7 +116,7 @@ bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) { SmallVector spillIs; if (lis->isReMaterializable(li, spillIs, isLoad)) { // If all of the definitions of the interval are re-materializable, - // it is a preferred candidate for spilling. If non of the defs are + // it is a preferred candidate for spilling. If none of the defs are // loads, then it's potentially very cheap to re-materialize. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp new file mode 100644 index 000000000000..62ad8171a9d4 --- /dev/null +++ b/lib/CodeGen/CallingConvLower.cpp @@ -0,0 +1,177 @@ +//===-- CallingConvLower.cpp - Calling Conventions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the CCState class, used for lowering and implementing +// calling conventions. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +CCState::CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &tm, + SmallVector &locs, LLVMContext &C) + : CallingConv(CC), IsVarArg(isVarArg), TM(tm), + TRI(*TM.getRegisterInfo()), Locs(locs), Context(C) { + // No stack is used. + StackOffset = 0; + + UsedRegs.resize((TRI.getNumRegs()+31)/32); +} + +// HandleByVal - Allocate a stack slot large enough to pass an argument by +// value. The size and alignment information of the argument is encoded in its +// parameter attribute. +void CCState::HandleByVal(unsigned ValNo, EVT ValVT, + EVT LocVT, CCValAssign::LocInfo LocInfo, + int MinSize, int MinAlign, + ISD::ArgFlagsTy ArgFlags) { + unsigned Align = ArgFlags.getByValAlign(); + unsigned Size = ArgFlags.getByValSize(); + if (MinSize > (int)Size) + Size = MinSize; + if (MinAlign > (int)Align) + Align = MinAlign; + unsigned Offset = AllocateStack(Size, Align); + + addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); +} + +/// MarkAllocated - Mark a register and all of its aliases as allocated. +void CCState::MarkAllocated(unsigned Reg) { + UsedRegs[Reg/32] |= 1 << (Reg&31); + + if (const unsigned *RegAliases = TRI.getAliasSet(Reg)) + for (; (Reg = *RegAliases); ++RegAliases) + UsedRegs[Reg/32] |= 1 << (Reg&31); +} + +/// AnalyzeFormalArguments - Analyze an array of argument values, +/// incorporating info about the formals into this state. +void +CCState::AnalyzeFormalArguments(const SmallVectorImpl &Ins, + CCAssignFn Fn) { + unsigned NumArgs = Ins.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + EVT ArgVT = Ins[i].VT; + ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { +#ifndef NDEBUG + dbgs() << "Formal argument #" << i << " has unhandled type " + << ArgVT.getEVTString(); +#endif + llvm_unreachable(0); + } + } +} + +/// CheckReturn - Analyze the return values of a function, returning true if +/// the return can be performed without sret-demotion, and false otherwise. +bool CCState::CheckReturn(const SmallVectorImpl &Outs, + CCAssignFn Fn) { + // Determine which register each value should be copied into. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + EVT VT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) + return false; + } + return true; +} + +/// AnalyzeReturn - Analyze the returned values of a return, +/// incorporating info about the result values into this state. +void CCState::AnalyzeReturn(const SmallVectorImpl &Outs, + CCAssignFn Fn) { + // Determine which register each value should be copied into. + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + EVT VT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) { +#ifndef NDEBUG + dbgs() << "Return operand #" << i << " has unhandled type " + << VT.getEVTString(); +#endif + llvm_unreachable(0); + } + } +} + +/// AnalyzeCallOperands - Analyze the outgoing arguments to a call, +/// incorporating info about the passed values into this state. 
+void CCState::AnalyzeCallOperands(const SmallVectorImpl &Outs, + CCAssignFn Fn) { + unsigned NumOps = Outs.size(); + for (unsigned i = 0; i != NumOps; ++i) { + EVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { +#ifndef NDEBUG + dbgs() << "Call operand #" << i << " has unhandled type " + << ArgVT.getEVTString(); +#endif + llvm_unreachable(0); + } + } +} + +/// AnalyzeCallOperands - Same as above except it takes vectors of types +/// and argument flags. +void CCState::AnalyzeCallOperands(SmallVectorImpl &ArgVTs, + SmallVectorImpl &Flags, + CCAssignFn Fn) { + unsigned NumOps = ArgVTs.size(); + for (unsigned i = 0; i != NumOps; ++i) { + EVT ArgVT = ArgVTs[i]; + ISD::ArgFlagsTy ArgFlags = Flags[i]; + if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { +#ifndef NDEBUG + dbgs() << "Call operand #" << i << " has unhandled type " + << ArgVT.getEVTString(); +#endif + llvm_unreachable(0); + } + } +} + +/// AnalyzeCallResult - Analyze the return values of a call, +/// incorporating info about the passed values into this state. +void CCState::AnalyzeCallResult(const SmallVectorImpl &Ins, + CCAssignFn Fn) { + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + EVT VT = Ins[i].VT; + ISD::ArgFlagsTy Flags = Ins[i].Flags; + if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) { +#ifndef NDEBUG + dbgs() << "Call result #" << i << " has unhandled type " + << VT.getEVTString(); +#endif + llvm_unreachable(0); + } + } +} + +/// AnalyzeCallResult - Same as above except it's specialized for calls which +/// produce a single value. +void CCState::AnalyzeCallResult(EVT VT, CCAssignFn Fn) { + if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) { +#ifndef NDEBUG + dbgs() << "Call result has unhandled type " + << VT.getEVTString(); +#endif + llvm_unreachable(0); + } +} diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp index 3ff2a046d233..e0e315c6c677 100644 --- a/lib/CodeGen/CodePlacementOpt.cpp +++ b/lib/CodeGen/CodePlacementOpt.cpp @@ -178,6 +178,8 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF, continue; // Move the block. + DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << Pred->getNumber() + << " to top of loop.\n"); Changed = true; // Move it and all the blocks that can reach it via fallthrough edges @@ -297,6 +299,8 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF, continue; // Move the block. 
+ DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << BB->getNumber() + << " to be contiguous with loop.\n"); Changed = true; // Process this block and all loop blocks contiguous with it, to keep diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index fd957b12fc44..e3746a985644 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -29,6 +30,7 @@ CriticalAntiDepBreaker:: CriticalAntiDepBreaker(MachineFunction& MFi) : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()), + TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), AllocatableSet(TRI->getAllocatableSet(MF)) { @@ -71,25 +73,27 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { DefIndices[AliasReg] = ~0u; } } - } else { - // In a non-return block, examine the live-in regs of all successors. - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + } + + // In a non-return block, examine the live-in regs of all successors. + // Note a return block can have successors if the return instruction is + // predicated. + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) - for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), + for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { - unsigned Reg = *I; - Classes[Reg] = reinterpret_cast(-1); - KillIndices[Reg] = BB->size(); - DefIndices[Reg] = ~0u; - // Repeat, for all aliases. - for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { - unsigned AliasReg = *Alias; - Classes[AliasReg] = reinterpret_cast(-1); - KillIndices[AliasReg] = BB->size(); - DefIndices[AliasReg] = ~0u; - } + unsigned Reg = *I; + Classes[Reg] = reinterpret_cast(-1); + KillIndices[Reg] = BB->size(); + DefIndices[Reg] = ~0u; + // Repeat, for all aliases. + for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { + unsigned AliasReg = *Alias; + Classes[AliasReg] = reinterpret_cast(-1); + KillIndices[AliasReg] = BB->size(); + DefIndices[AliasReg] = ~0u; } - } + } // Mark live-out callee-saved registers. In a return block this is // all callee-saved registers. In non-return this is any @@ -164,6 +168,26 @@ static const SDep *CriticalPathStep(const SUnit *SU) { } void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { + // It's not safe to change register allocation for source operands of + // that have special allocation requirements. Also assume all registers + // used in a call must not be changed (ABI). + // FIXME: The issue with predicated instruction is more complex. We are being + // conservatively here because the kill markers cannot be trusted after + // if-conversion: + // %R6 = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14] + // ... + // STR %R0, %R6, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395] + // %R6 = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12] + // STR %R0, %R6, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8) + // + // The first R6 kill is not really a kill since it's killed by a predicated + // instruction which may not be executed. 
The second R6 def may or may not + // re-define R6 so it's not safe to change it since the last R6 use cannot be + // changed. + bool Special = MI->getDesc().isCall() || + MI->getDesc().hasExtraSrcRegAllocReq() || + TII->isPredicated(MI); + // Scan the register operands for this instruction and update // Classes and RegRefs. for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -199,9 +223,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { if (Classes[Reg] != reinterpret_cast(-1)) RegRefs.insert(std::make_pair(Reg, &MO)); - // It's not safe to change register allocation for source operands of - // that have special allocation requirements. - if (MO.isUse() && MI->getDesc().hasExtraSrcRegAllocReq()) { + if (MO.isUse() && Special) { if (KeepRegs.insert(Reg)) { for (const unsigned *Subreg = TRI->getSubRegisters(Reg); *Subreg; ++Subreg) @@ -216,38 +238,43 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, // Update liveness. // Proceding upwards, registers that are defed but not used in this // instruction are now dead. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (Reg == 0) continue; - if (!MO.isDef()) continue; - // Ignore two-addr defs. - if (MI->isRegTiedToUseOperand(i)) continue; - - DefIndices[Reg] = Count; - KillIndices[Reg] = ~0u; - assert(((KillIndices[Reg] == ~0u) != - (DefIndices[Reg] == ~0u)) && - "Kill and Def maps aren't consistent for Reg!"); - KeepRegs.erase(Reg); - Classes[Reg] = 0; - RegRefs.erase(Reg); - // Repeat, for all subregs. - for (const unsigned *Subreg = TRI->getSubRegisters(Reg); - *Subreg; ++Subreg) { - unsigned SubregReg = *Subreg; - DefIndices[SubregReg] = Count; - KillIndices[SubregReg] = ~0u; - KeepRegs.erase(SubregReg); - Classes[SubregReg] = 0; - RegRefs.erase(SubregReg); - } - // Conservatively mark super-registers as unusable. - for (const unsigned *Super = TRI->getSuperRegisters(Reg); - *Super; ++Super) { - unsigned SuperReg = *Super; - Classes[SuperReg] = reinterpret_cast(-1); + + if (!TII->isPredicated(MI)) { + // Predicated defs are modeled as read + write, i.e. similar to two + // address updates. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0) continue; + if (!MO.isDef()) continue; + // Ignore two-addr defs. + if (MI->isRegTiedToUseOperand(i)) continue; + + DefIndices[Reg] = Count; + KillIndices[Reg] = ~0u; + assert(((KillIndices[Reg] == ~0u) != + (DefIndices[Reg] == ~0u)) && + "Kill and Def maps aren't consistent for Reg!"); + KeepRegs.erase(Reg); + Classes[Reg] = 0; + RegRefs.erase(Reg); + // Repeat, for all subregs. + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) { + unsigned SubregReg = *Subreg; + DefIndices[SubregReg] = Count; + KillIndices[SubregReg] = ~0u; + KeepRegs.erase(SubregReg); + Classes[SubregReg] = 0; + RegRefs.erase(SubregReg); + } + // Conservatively mark super-registers as unusable. + for (const unsigned *Super = TRI->getSuperRegisters(Reg); + *Super; ++Super) { + unsigned SuperReg = *Super; + Classes[SuperReg] = reinterpret_cast(-1); + } } } for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -334,10 +361,15 @@ BreakAntiDependencies(const std::vector& SUnits, // so just duck out immediately if the block is empty. 
if (SUnits.empty()) return 0; + // Keep a map of the MachineInstr*'s back to the SUnit representing them. + // This is used for updating debug information. + DenseMap MISUnitMap; + // Find the node at the bottom of the critical path. const SUnit *Max = 0; for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { const SUnit *SU = &SUnits[i]; + MISUnitMap[SU->getInstr()] = SU; if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency) Max = SU; } @@ -473,7 +505,11 @@ BreakAntiDependencies(const std::vector& SUnits, PrescanInstruction(MI); - if (MI->getDesc().hasExtraDefRegAllocReq()) + // If MI's defs have a special allocation requirement, don't allow + // any def registers to be changed. Also assume all registers + // defined in a call must not be changed (ABI). + if (MI->getDesc().isCall() || MI->getDesc().hasExtraDefRegAllocReq() || + TII->isPredicated(MI)) // If this instruction's defs have special allocation requirement, don't // break this anti-dependency. AntiDepReg = 0; @@ -485,7 +521,7 @@ BreakAntiDependencies(const std::vector& SUnits, if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg == 0) continue; - if (MO.isUse() && AntiDepReg == Reg) { + if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) { AntiDepReg = 0; break; } @@ -519,8 +555,22 @@ BreakAntiDependencies(const std::vector& SUnits, std::multimap::iterator> Range = RegRefs.equal_range(AntiDepReg); for (std::multimap::iterator - Q = Range.first, QE = Range.second; Q != QE; ++Q) + Q = Range.first, QE = Range.second; Q != QE; ++Q) { Q->second->setReg(NewReg); + // If the SU for the instruction being updated has debug information + // related to the anti-dependency register, make sure to update that + // as well. + const SUnit *SU = MISUnitMap[Q->second->getParent()]; + if (!SU) continue; + for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) { + MachineInstr *DI = SU->DbgInstrList[i]; + assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() && + DI->getOperand(0).getReg() + && "Non register dbg_value attached to SUnit!"); + if (DI->getOperand(0).getReg() == AntiDepReg) + DI->getOperand(0).setReg(NewReg); + } + } // We just went back in time and modified history; the // liveness information for the anti-depenence reg is now diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index cc42dd2b8e32..540630083bcc 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -22,15 +22,18 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include namespace llvm { +class TargetInstrInfo; +class TargetRegisterInfo; + class CriticalAntiDepBreaker : public AntiDepBreaker { MachineFunction& MF; MachineRegisterInfo &MRI; + const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; /// AllocatableSet - The set of allocatable registers. 
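The CriticalAntiDepBreaker hunks above keep DBG_VALUE users in sync when an anti-dependence register is renamed, using the new MISUnitMap and the SUnit::DbgInstrList attached to each scheduling unit. As a condensed, hedged restatement of that fix-up (not part of the patch itself), the helper below shows the pattern in isolation; updateDbgValues is a hypothetical name, and it assumes the SUnit::DbgInstrList member used by this revision.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/ScheduleDAG.h"
using namespace llvm;

// Retarget any DBG_VALUE attached to SU that still refers to the register
// being renamed, mirroring the loop added to BreakAntiDependencies above.
static void updateDbgValues(const SUnit *SU, unsigned AntiDepReg,
                            unsigned NewReg) {
  if (!SU)
    return;
  for (unsigned i = 0, e = SU->DbgInstrList.size(); i != e; ++i) {
    MachineInstr *DI = SU->DbgInstrList[i];
    // The dbg_value form handled here carries the tracked register in
    // operand 0; anything else is left untouched.
    if (DI->getOperand(0).isReg() &&
        DI->getOperand(0).getReg() == AntiDepReg)
      DI->getOperand(0).setReg(NewReg);
  }
}

The patch performs this work inline while walking the RegRefs range for AntiDepReg, so the helper above is purely illustrative.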
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index f6739f434044..01b31b420931 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CallSite.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" @@ -87,10 +88,13 @@ namespace { /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still /// use the ".llvm.eh.catch.all.value" call need to convert to using its /// initializer instead. - bool CleanupSelectors(); + bool CleanupSelectors(SmallPtrSet &Sels); + + bool HasCatchAllInSelector(IntrinsicInst *); /// FindAllCleanupSelectors - Find all eh.selector calls that are clean-ups. - void FindAllCleanupSelectors(SmallPtrSet &Sels); + void FindAllCleanupSelectors(SmallPtrSet &Sels, + SmallPtrSet &CatchAllSels); /// FindAllURoRInvokes - Find all URoR invokes in the function. void FindAllURoRInvokes(SmallPtrSet &URoRInvokes); @@ -150,7 +154,7 @@ namespace { Changed = true; } - return false; + return Changed; } public: @@ -186,25 +190,32 @@ FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm, bool fast) { return new DwarfEHPrepare(tm, fast); } +/// HasCatchAllInSelector - Return true if the intrinsic instruction has a +/// catch-all. +bool DwarfEHPrepare::HasCatchAllInSelector(IntrinsicInst *II) { + if (!EHCatchAllValue) return false; + + unsigned ArgIdx = II->getNumArgOperands() - 1; + GlobalVariable *GV = dyn_cast(II->getArgOperand(ArgIdx)); + return GV == EHCatchAllValue; +} + /// FindAllCleanupSelectors - Find all eh.selector calls that are clean-ups. void DwarfEHPrepare:: -FindAllCleanupSelectors(SmallPtrSet &Sels) { +FindAllCleanupSelectors(SmallPtrSet &Sels, + SmallPtrSet &CatchAllSels) { for (Value::use_iterator I = SelectorIntrinsic->use_begin(), E = SelectorIntrinsic->use_end(); I != E; ++I) { - IntrinsicInst *SI = cast(I); - if (!SI || SI->getParent()->getParent() != F) continue; - - unsigned NumOps = SI->getNumOperands(); - if (NumOps > 4) continue; - bool IsCleanUp = (NumOps == 3); + IntrinsicInst *II = cast(I); - if (!IsCleanUp) - if (ConstantInt *CI = dyn_cast(SI->getOperand(3))) - IsCleanUp = (CI->getZExtValue() == 0); + if (II->getParent()->getParent() != F) + continue; - if (IsCleanUp) - Sels.insert(SI); + if (!HasCatchAllInSelector(II)) + Sels.insert(II); + else + CatchAllSels.insert(II); } } @@ -222,7 +233,7 @@ FindAllURoRInvokes(SmallPtrSet &URoRInvokes) { /// CleanupSelectors - Any remaining eh.selector intrinsic calls which still use /// the ".llvm.eh.catch.all.value" call need to convert to using its /// initializer instead. -bool DwarfEHPrepare::CleanupSelectors() { +bool DwarfEHPrepare::CleanupSelectors(SmallPtrSet &Sels) { if (!EHCatchAllValue) return false; if (!SelectorIntrinsic) { @@ -232,17 +243,15 @@ bool DwarfEHPrepare::CleanupSelectors() { } bool Changed = false; - for (Value::use_iterator - I = SelectorIntrinsic->use_begin(), - E = SelectorIntrinsic->use_end(); I != E; ++I) { - IntrinsicInst *Sel = dyn_cast(I); - if (!Sel || Sel->getParent()->getParent() != F) continue; + for (SmallPtrSet::iterator + I = Sels.begin(), E = Sels.end(); I != E; ++I) { + IntrinsicInst *Sel = *I; // Index of the ".llvm.eh.catch.all.value" variable. 
- unsigned OpIdx = Sel->getNumOperands() - 1; - GlobalVariable *GV = dyn_cast(Sel->getOperand(OpIdx)); + unsigned OpIdx = Sel->getNumArgOperands() - 1; + GlobalVariable *GV = dyn_cast(Sel->getArgOperand(OpIdx)); if (GV != EHCatchAllValue) continue; - Sel->setOperand(OpIdx, EHCatchAllValue->getInitializer()); + Sel->setArgOperand(OpIdx, EHCatchAllValue->getInitializer()); Changed = true; } @@ -293,8 +302,6 @@ DwarfEHPrepare::FindSelectorAndURoR(Instruction *Inst, bool &URoRInvoke, /// function. This is a candidate to merge the selector associated with the URoR /// invoke with the one from the URoR's landing pad. bool DwarfEHPrepare::HandleURoRInvokes() { - if (!DT) return CleanupSelectors(); // We require DominatorTree information. - if (!EHCatchAllValue) { EHCatchAllValue = F->getParent()->getNamedGlobal(".llvm.eh.catch.all.value"); @@ -307,14 +314,20 @@ bool DwarfEHPrepare::HandleURoRInvokes() { if (!SelectorIntrinsic) return false; } + SmallPtrSet Sels; + SmallPtrSet CatchAllSels; + FindAllCleanupSelectors(Sels, CatchAllSels); + + if (!DT) + // We require DominatorTree information. + return CleanupSelectors(CatchAllSels); + if (!URoR) { URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow"); - if (!URoR) return CleanupSelectors(); + if (!URoR) return CleanupSelectors(CatchAllSels); } - SmallPtrSet Sels; SmallPtrSet URoRInvokes; - FindAllCleanupSelectors(Sels); FindAllURoRInvokes(URoRInvokes); SmallPtrSet SelsToConvert; @@ -340,7 +353,8 @@ bool DwarfEHPrepare::HandleURoRInvokes() { if (!ExceptionValueIntrinsic) { ExceptionValueIntrinsic = Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_exception); - if (!ExceptionValueIntrinsic) return CleanupSelectors(); + if (!ExceptionValueIntrinsic) + return CleanupSelectors(CatchAllSels); } for (Value::use_iterator @@ -360,21 +374,9 @@ bool DwarfEHPrepare::HandleURoRInvokes() { // an eh.selector intrinsic call. If the eh.selector is a 'clean-up', we // need to convert it to a 'catch-all'. for (SmallPtrSet::iterator - SI = SelCalls.begin(), SE = SelCalls.end(); SI != SE; ++SI) { - IntrinsicInst *II = *SI; - unsigned NumOps = II->getNumOperands(); - - if (NumOps <= 4) { - bool IsCleanUp = (NumOps == 3); - - if (!IsCleanUp) - if (ConstantInt *CI = dyn_cast(II->getOperand(3))) - IsCleanUp = (CI->getZExtValue() == 0); - - if (IsCleanUp) - SelsToConvert.insert(II); - } - } + SI = SelCalls.begin(), SE = SelCalls.end(); SI != SE; ++SI) + if (!HasCatchAllInSelector(*SI)) + SelsToConvert.insert(*SI); } } } @@ -388,12 +390,22 @@ bool DwarfEHPrepare::HandleURoRInvokes() { SI = SelsToConvert.begin(), SE = SelsToConvert.end(); SI != SE; ++SI) { IntrinsicInst *II = *SI; - SmallVector Args; // Use the exception object pointer and the personality function // from the original selector. - Args.push_back(II->getOperand(1)); // Exception object pointer. - Args.push_back(II->getOperand(2)); // Personality function. + CallSite CS(II); + IntrinsicInst::op_iterator I = CS.arg_begin(); + IntrinsicInst::op_iterator E = CS.arg_end(); + IntrinsicInst::op_iterator B = prior(E); + + // Exclude last argument if it is an integer. + if (isa(B)) E = B; + + // Add exception object pointer (front). + // Add personality function (next). + // Add in any filter IDs (rest). + SmallVector Args(I, E); + Args.push_back(EHCatchAllValue->getInitializer()); // Catch-all indicator. 
CallInst *NewSelector = @@ -409,7 +421,7 @@ bool DwarfEHPrepare::HandleURoRInvokes() { } } - Changed |= CleanupSelectors(); + Changed |= CleanupSelectors(CatchAllSels); return Changed; } diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp index 8416d3bda930..36b0e6514b3a 100644 --- a/lib/CodeGen/ELFCodeEmitter.cpp +++ b/lib/CodeGen/ELFCodeEmitter.cpp @@ -90,7 +90,7 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) { for (std::vector::iterator MRI = JTRelocations.begin(), MRE = JTRelocations.end(); MRI != MRE; ++MRI) { MachineRelocation &MR = *MRI; - unsigned MBBOffset = getMachineBasicBlockAddress(MR.getBasicBlock()); + uintptr_t MBBOffset = getMachineBasicBlockAddress(MR.getBasicBlock()); MR.setResultPointer((void*)MBBOffset); MR.setConstantVal(ES->SectionIdx); JTSection.addRelocation(MR); diff --git a/lib/CodeGen/ExactHazardRecognizer.cpp b/lib/CodeGen/ExactHazardRecognizer.cpp deleted file mode 100644 index af5f2892c2f0..000000000000 --- a/lib/CodeGen/ExactHazardRecognizer.cpp +++ /dev/null @@ -1,180 +0,0 @@ -//===----- ExactHazardRecognizer.cpp - hazard recognizer -------- ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This implements a hazard recognizer using the instructions itineraries -// defined for the current target. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "post-RA-sched" -#include "ExactHazardRecognizer.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrItineraries.h" - -using namespace llvm; - -ExactHazardRecognizer:: -ExactHazardRecognizer(const InstrItineraryData &LItinData) : - ScheduleHazardRecognizer(), ItinData(LItinData) -{ - // Determine the maximum depth of any itinerary. This determines the - // depth of the scoreboard. We always make the scoreboard at least 1 - // cycle deep to avoid dealing with the boundary condition. - unsigned ScoreboardDepth = 1; - if (!ItinData.isEmpty()) { - for (unsigned idx = 0; ; ++idx) { - if (ItinData.isEndMarker(idx)) - break; - - const InstrStage *IS = ItinData.beginStage(idx); - const InstrStage *E = ItinData.endStage(idx); - unsigned ItinDepth = 0; - for (; IS != E; ++IS) - ItinDepth += IS->getCycles(); - - ScoreboardDepth = std::max(ScoreboardDepth, ItinDepth); - } - } - - ReservedScoreboard.reset(ScoreboardDepth); - RequiredScoreboard.reset(ScoreboardDepth); - - DEBUG(dbgs() << "Using exact hazard recognizer: ScoreboardDepth = " - << ScoreboardDepth << '\n'); -} - -void ExactHazardRecognizer::Reset() { - RequiredScoreboard.reset(); - ReservedScoreboard.reset(); -} - -void ExactHazardRecognizer::ScoreBoard::dump() const { - dbgs() << "Scoreboard:\n"; - - unsigned last = Depth - 1; - while ((last > 0) && ((*this)[last] == 0)) - last--; - - for (unsigned i = 0; i <= last; i++) { - unsigned FUs = (*this)[i]; - dbgs() << "\t"; - for (int j = 31; j >= 0; j--) - dbgs() << ((FUs & (1 << j)) ? 
'1' : '0'); - dbgs() << '\n'; - } -} - -ExactHazardRecognizer::HazardType ExactHazardRecognizer::getHazardType(SUnit *SU) { - if (ItinData.isEmpty()) - return NoHazard; - - unsigned cycle = 0; - - // Use the itinerary for the underlying instruction to check for - // free FU's in the scoreboard at the appropriate future cycles. - unsigned idx = SU->getInstr()->getDesc().getSchedClass(); - for (const InstrStage *IS = ItinData.beginStage(idx), - *E = ItinData.endStage(idx); IS != E; ++IS) { - // We must find one of the stage's units free for every cycle the - // stage is occupied. FIXME it would be more accurate to find the - // same unit free in all the cycles. - for (unsigned int i = 0; i < IS->getCycles(); ++i) { - assert(((cycle + i) < RequiredScoreboard.getDepth()) && - "Scoreboard depth exceeded!"); - - unsigned freeUnits = IS->getUnits(); - switch (IS->getReservationKind()) { - default: - assert(0 && "Invalid FU reservation"); - case InstrStage::Required: - // Required FUs conflict with both reserved and required ones - freeUnits &= ~ReservedScoreboard[cycle + i]; - // FALLTHROUGH - case InstrStage::Reserved: - // Reserved FUs can conflict only with required ones. - freeUnits &= ~RequiredScoreboard[cycle + i]; - break; - } - - if (!freeUnits) { - DEBUG(dbgs() << "*** Hazard in cycle " << (cycle + i) << ", "); - DEBUG(dbgs() << "SU(" << SU->NodeNum << "): "); - DEBUG(SU->getInstr()->dump()); - return Hazard; - } - } - - // Advance the cycle to the next stage. - cycle += IS->getNextCycles(); - } - - return NoHazard; -} - -void ExactHazardRecognizer::EmitInstruction(SUnit *SU) { - if (ItinData.isEmpty()) - return; - - unsigned cycle = 0; - - // Use the itinerary for the underlying instruction to reserve FU's - // in the scoreboard at the appropriate future cycles. - unsigned idx = SU->getInstr()->getDesc().getSchedClass(); - for (const InstrStage *IS = ItinData.beginStage(idx), - *E = ItinData.endStage(idx); IS != E; ++IS) { - // We must reserve one of the stage's units for every cycle the - // stage is occupied. FIXME it would be more accurate to reserve - // the same unit free in all the cycles. - for (unsigned int i = 0; i < IS->getCycles(); ++i) { - assert(((cycle + i) < RequiredScoreboard.getDepth()) && - "Scoreboard depth exceeded!"); - - unsigned freeUnits = IS->getUnits(); - switch (IS->getReservationKind()) { - default: - assert(0 && "Invalid FU reservation"); - case InstrStage::Required: - // Required FUs conflict with both reserved and required ones - freeUnits &= ~ReservedScoreboard[cycle + i]; - // FALLTHROUGH - case InstrStage::Reserved: - // Reserved FUs can conflict only with required ones. - freeUnits &= ~RequiredScoreboard[cycle + i]; - break; - } - - // reduce to a single unit - unsigned freeUnit = 0; - do { - freeUnit = freeUnits; - freeUnits = freeUnit & (freeUnit - 1); - } while (freeUnits); - - assert(freeUnit && "No function unit available!"); - if (IS->getReservationKind() == InstrStage::Required) - RequiredScoreboard[cycle + i] |= freeUnit; - else - ReservedScoreboard[cycle + i] |= freeUnit; - } - - // Advance the cycle to the next stage. 
- cycle += IS->getNextCycles(); - } - - DEBUG(ReservedScoreboard.dump()); - DEBUG(RequiredScoreboard.dump()); -} - -void ExactHazardRecognizer::AdvanceCycle() { - ReservedScoreboard[0] = 0; ReservedScoreboard.advance(); - RequiredScoreboard[0] = 0; RequiredScoreboard.advance(); -} diff --git a/lib/CodeGen/ExactHazardRecognizer.h b/lib/CodeGen/ExactHazardRecognizer.h deleted file mode 100644 index 91c81a970fa5..000000000000 --- a/lib/CodeGen/ExactHazardRecognizer.h +++ /dev/null @@ -1,86 +0,0 @@ -//=- llvm/CodeGen/ExactHazardRecognizer.h - Scheduling Support -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ExactHazardRecognizer class, which -// implements hazard-avoidance heuristics for scheduling, based on the -// scheduling itineraries specified for the target. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_EXACTHAZARDRECOGNIZER_H -#define LLVM_CODEGEN_EXACTHAZARDRECOGNIZER_H - -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Target/TargetInstrItineraries.h" - -namespace llvm { - class ExactHazardRecognizer : public ScheduleHazardRecognizer { - // ScoreBoard to track function unit usage. ScoreBoard[0] is a - // mask of the FUs in use in the cycle currently being - // schedule. ScoreBoard[1] is a mask for the next cycle. The - // ScoreBoard is used as a circular buffer with the current cycle - // indicated by Head. - class ScoreBoard { - unsigned *Data; - - // The maximum number of cycles monitored by the Scoreboard. This - // value is determined based on the target itineraries to ensure - // that all hazards can be tracked. - size_t Depth; - // Indices into the Scoreboard that represent the current cycle. - size_t Head; - public: - ScoreBoard():Data(NULL), Depth(0), Head(0) { } - ~ScoreBoard() { - delete[] Data; - } - - size_t getDepth() const { return Depth; } - unsigned& operator[](size_t idx) const { - assert(Depth && "ScoreBoard was not initialized properly!"); - - return Data[(Head + idx) % Depth]; - } - - void reset(size_t d = 1) { - if (Data == NULL) { - Depth = d; - Data = new unsigned[Depth]; - } - - memset(Data, 0, Depth * sizeof(Data[0])); - Head = 0; - } - - void advance() { - Head = (Head + 1) % Depth; - } - - // Print the scoreboard. - void dump() const; - }; - - // Itinerary data for the target. - const InstrItineraryData &ItinData; - - ScoreBoard ReservedScoreboard; - ScoreBoard RequiredScoreboard; - - public: - ExactHazardRecognizer(const InstrItineraryData &ItinData); - - virtual HazardType getHazardType(SUnit *SU); - virtual void Reset(); - virtual void EmitInstruction(SUnit *SU); - virtual void AdvanceCycle(); - }; -} - -#endif diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index 790cb2164897..71506cc6abb9 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -271,7 +271,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { case Intrinsic::gcwrite: if (LowerWr) { // Replace a write barrier with a simple store. 
- Value *St = new StoreInst(CI->getOperand(1), CI->getOperand(3), CI); + Value *St = new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI); CI->replaceAllUsesWith(St); CI->eraseFromParent(); } @@ -279,7 +279,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { case Intrinsic::gcread: if (LowerRd) { // Replace a read barrier with a simple load. - Value *Ld = new LoadInst(CI->getOperand(2), "", CI); + Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI); Ld->takeName(CI); CI->replaceAllUsesWith(Ld); CI->eraseFromParent(); @@ -290,7 +290,7 @@ bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { // Initialize the GC root, but do not delete the intrinsic. The // backend needs the intrinsic to flag the stack slot. Roots.push_back(cast( - CI->getOperand(1)->stripPointerCasts())); + CI->getArgOperand(0)->stripPointerCasts())); } break; default: diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index c61fd17e7911..6b445e0b8e0f 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -20,6 +20,7 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -33,20 +34,22 @@ using namespace llvm; static cl::opt IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); static cl::opt IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden); static cl::opt IfCvtLimit("ifcvt-limit", cl::init(-1), cl::Hidden); -static cl::opt DisableSimple("disable-ifcvt-simple", +static cl::opt DisableSimple("disable-ifcvt-simple", cl::init(false), cl::Hidden); -static cl::opt DisableSimpleF("disable-ifcvt-simple-false", +static cl::opt DisableSimpleF("disable-ifcvt-simple-false", cl::init(false), cl::Hidden); -static cl::opt DisableTriangle("disable-ifcvt-triangle", +static cl::opt DisableTriangle("disable-ifcvt-triangle", cl::init(false), cl::Hidden); -static cl::opt DisableTriangleR("disable-ifcvt-triangle-rev", +static cl::opt DisableTriangleR("disable-ifcvt-triangle-rev", cl::init(false), cl::Hidden); -static cl::opt DisableTriangleF("disable-ifcvt-triangle-false", +static cl::opt DisableTriangleF("disable-ifcvt-triangle-false", cl::init(false), cl::Hidden); -static cl::opt DisableTriangleFR("disable-ifcvt-triangle-false-rev", +static cl::opt DisableTriangleFR("disable-ifcvt-triangle-false-rev", cl::init(false), cl::Hidden); -static cl::opt DisableDiamond("disable-ifcvt-diamond", +static cl::opt DisableDiamond("disable-ifcvt-diamond", cl::init(false), cl::Hidden); +static cl::opt IfCvtBranchFold("ifcvt-branch-fold", + cl::init(true), cl::Hidden); STATISTIC(NumSimple, "Number of simple if-conversions performed"); STATISTIC(NumSimpleFalse, "Number of simple (F) if-conversions performed"); @@ -115,7 +118,7 @@ namespace { BB(0), TrueBB(0), FalseBB(0) {} }; - /// IfcvtToken - Record information about pending if-conversions to attemp: + /// IfcvtToken - Record information about pending if-conversions to attempt: /// BBI - Corresponding BBInfo. /// Kind - Type of block. See IfcvtKind. 
/// NeedSubsumption - True if the to-be-predicated BB has already been @@ -146,6 +149,7 @@ namespace { const TargetLowering *TLI; const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; bool MadeChange; int FnNum; public: @@ -167,8 +171,7 @@ namespace { std::vector &Tokens); bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl &Cond, bool isTriangle = false, bool RevBranch = false); - bool AnalyzeBlocks(MachineFunction &MF, - std::vector &Tokens); + void AnalyzeBlocks(MachineFunction &MF, std::vector &Tokens); void InvalidatePreds(MachineBasicBlock *BB); void RemoveExtraEdges(BBInfo &BBI); bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind); @@ -177,14 +180,22 @@ namespace { unsigned NumDups1, unsigned NumDups2); void PredicateBlock(BBInfo &BBI, MachineBasicBlock::iterator E, - SmallVectorImpl &Cond); + SmallVectorImpl &Cond, + SmallSet &Redefs); void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, SmallVectorImpl &Cond, + SmallSet &Redefs, bool IgnoreBr = false); - void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI); + void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true); - bool MeetIfcvtSizeLimit(unsigned Size) const { - return Size > 0 && Size <= TLI->getIfCvtBlockSizeLimit(); + bool MeetIfcvtSizeLimit(MachineBasicBlock &BB, unsigned Size) const { + return Size > 0 && TII->isProfitableToIfCvt(BB, Size); + } + + bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB, unsigned TSize, + MachineBasicBlock &FBB, unsigned FSize) const { + return TSize > 0 && FSize > 0 && + TII->isProfitableToIfCvt(TBB, TSize, FBB, FSize); } // blockAlwaysFallThrough - Block ends without a terminator. @@ -227,8 +238,15 @@ FunctionPass *llvm::createIfConverterPass() { return new IfConverter(); } bool IfConverter::runOnMachineFunction(MachineFunction &MF) { TLI = MF.getTarget().getTargetLowering(); TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); if (!TII) return false; + // Tail merge tend to expose more if-conversion opportunities. + BranchFolder BF(true); + bool BFChange = BF.OptimizeFunction(MF, TII, + MF.getTarget().getRegisterInfo(), + getAnalysisIfAvailable()); + DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'" << MF.getFunction()->getName() << "\'"); @@ -253,7 +271,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { while (IfCvtLimit == -1 || (int)NumIfCvts < IfCvtLimit) { // Do an initial analysis for each basic block and find all the potential // candidates to perform if-conversion. - bool Change = AnalyzeBlocks(MF, Tokens); + bool Change = false; + AnalyzeBlocks(MF, Tokens); while (!Tokens.empty()) { IfcvtToken *Token = Tokens.back(); Tokens.pop_back(); @@ -281,7 +300,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { case ICSimpleFalse: { bool isFalse = Kind == ICSimpleFalse; if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break; - DEBUG(dbgs() << "Ifcvt (Simple" << (Kind == ICSimpleFalse ? " false" :"") + DEBUG(dbgs() << "Ifcvt (Simple" << (Kind == ICSimpleFalse ? + " false" : "") << "): BB#" << BBI.BB->getNumber() << " (" << ((Kind == ICSimpleFalse) ? BBI.FalseBB->getNumber() @@ -289,8 +309,8 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { RetVal = IfConvertSimple(BBI, Kind); DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n"); if (RetVal) { - if (isFalse) NumSimpleFalse++; - else NumSimple++; + if (isFalse) ++NumSimpleFalse; + else ++NumSimple; } break; } @@ -316,11 +336,11 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << (RetVal ? 
"succeeded!" : "failed!") << "\n"); if (RetVal) { if (isFalse) { - if (isRev) NumTriangleFRev++; - else NumTriangleFalse++; + if (isRev) ++NumTriangleFRev; + else ++NumTriangleFalse; } else { - if (isRev) NumTriangleRev++; - else NumTriangle++; + if (isRev) ++NumTriangleRev; + else ++NumTriangle; } } break; @@ -332,7 +352,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { << BBI.FalseBB->getNumber() << ") "); RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2); DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n"); - if (RetVal) NumDiamonds++; + if (RetVal) ++NumDiamonds; break; } } @@ -361,13 +381,14 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { Roots.clear(); BBAnalysis.clear(); - if (MadeChange) { + if (MadeChange && IfCvtBranchFold) { BranchFolder BF(false); BF.OptimizeFunction(MF, TII, MF.getTarget().getRegisterInfo(), getAnalysisIfAvailable()); } + MadeChange |= BFChange; return MadeChange; } @@ -387,9 +408,10 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, /// ReverseBranchCondition - Reverse the condition of the end of the block /// branch. Swap block's 'true' and 'false' successors. bool IfConverter::ReverseBranchCondition(BBInfo &BBI) { + DebugLoc dl; // FIXME: this is nowhere if (!TII->ReverseBranchCondition(BBI.BrCond)) { TII->RemoveBranch(*BBI.BB); - TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond); + TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond, dl); std::swap(BBI.TrueBB, BBI.FalseBB); return true; } @@ -420,7 +442,7 @@ bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups) const { if (TrueBBI.BB->pred_size() > 1) { if (TrueBBI.CannotBeCopied || - TrueBBI.NonPredSize > TLI->getIfCvtDupBlockSizeLimit()) + !TII->isProfitableToDupForIfCvt(*TrueBBI.BB, TrueBBI.NonPredSize)) return false; Dups = TrueBBI.NonPredSize; } @@ -431,7 +453,7 @@ bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups) const { /// ValidTriangle - Returns true if the 'true' and 'false' blocks (along /// with their common predecessor) forms a valid triangle shape for ifcvt. /// If 'FalseBranch' is true, it checks if 'true' block's false branch -/// branches to the false branch rather than the other way around. It also +/// branches to the 'false' block rather than the other way around. It also /// returns the number of instructions that the ifcvt would need to duplicate /// if performed in 'Dups'. bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, @@ -457,7 +479,7 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI, ++Size; } } - if (Size > TLI->getIfCvtDupBlockSizeLimit()) + if (!TII->isProfitableToDupForIfCvt(*TrueBBI.BB, Size)) return false; Dups = Size; } @@ -514,7 +536,27 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, MachineBasicBlock::iterator TI = TrueBBI.BB->begin(); MachineBasicBlock::iterator FI = FalseBBI.BB->begin(); - while (TI != TrueBBI.BB->end() && FI != FalseBBI.BB->end()) { + MachineBasicBlock::iterator TIE = TrueBBI.BB->end(); + MachineBasicBlock::iterator FIE = FalseBBI.BB->end(); + // Skip dbg_value instructions + while (TI != TIE && TI->isDebugValue()) + ++TI; + while (FI != FIE && FI->isDebugValue()) + ++FI; + while (TI != TIE && FI != FIE) { + // Skip dbg_value instructions. These do not count. 
+ if (TI->isDebugValue()) { + while (TI != TIE && TI->isDebugValue()) + ++TI; + if (TI == TIE) + break; + } + if (FI->isDebugValue()) { + while (FI != FIE && FI->isDebugValue()) + ++FI; + if (FI == FIE) + break; + } if (!TI->isIdenticalTo(FI)) break; ++Dups1; @@ -524,7 +566,27 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, TI = firstNonBranchInst(TrueBBI.BB, TII); FI = firstNonBranchInst(FalseBBI.BB, TII); - while (TI != TrueBBI.BB->begin() && FI != FalseBBI.BB->begin()) { + MachineBasicBlock::iterator TIB = TrueBBI.BB->begin(); + MachineBasicBlock::iterator FIB = FalseBBI.BB->begin(); + // Skip dbg_value instructions at end of the bb's. + while (TI != TIB && TI->isDebugValue()) + --TI; + while (FI != FIB && FI->isDebugValue()) + --FI; + while (TI != TIB && FI != FIB) { + // Skip dbg_value instructions. These do not count. + if (TI->isDebugValue()) { + while (TI != TIB && TI->isDebugValue()) + --TI; + if (TI == TIB) + break; + } + if (FI->isDebugValue()) { + while (FI != FIB && FI->isDebugValue()) + --FI; + if (FI == FIB) + break; + } if (!TI->isIdenticalTo(FI)) break; ++Dups2; @@ -556,7 +618,7 @@ void IfConverter::ScanInstructions(BBInfo &BBI) { // No false branch. This BB must end with a conditional branch and a // fallthrough. if (!BBI.FalseBB) - BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB); + BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB); if (!BBI.FalseBB) { // Malformed bcc? True and false blocks are the same? BBI.IsUnpredicable = true; @@ -569,6 +631,9 @@ void IfConverter::ScanInstructions(BBInfo &BBI) { BBI.ClobbersPred = false; for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end(); I != E; ++I) { + if (I->isDebugValue()) + continue; + const TargetInstrDesc &TID = I->getDesc(); if (TID.isNotDuplicable()) BBI.CannotBeCopied = true; @@ -702,8 +767,8 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, bool FNeedSub = FalseBBI.Predicate.size() > 0; bool Enqueued = false; if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize - (Dups + Dups2)) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize - (Dups + Dups2)) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize - (Dups + Dups2), + *FalseBBI.BB, FalseBBI.NonPredSize - (Dups + Dups2)) && FeasibilityAnalysis(TrueBBI, BBI.BrCond) && FeasibilityAnalysis(FalseBBI, RevCond)) { // Diamond: @@ -720,7 +785,7 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, } if (ValidTriangle(TrueBBI, FalseBBI, false, Dups) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize) && FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) { // Triangle: // EBB @@ -732,23 +797,23 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, Tokens.push_back(new IfcvtToken(BBI, ICTriangle, TNeedSub, Dups)); Enqueued = true; } - + if (ValidTriangle(TrueBBI, FalseBBI, true, Dups) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize) && FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) { Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups)); Enqueued = true; } if (ValidSimple(TrueBBI, Dups) && - MeetIfcvtSizeLimit(TrueBBI.NonPredSize) && + MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize) && FeasibilityAnalysis(TrueBBI, BBI.BrCond)) { // Simple (split, no rejoin): // EBB // | \_ // | | // | TBB---> exit - // | + // | // FBB Tokens.push_back(new IfcvtToken(BBI, ICSimple, TNeedSub, Dups)); Enqueued = true; @@ 
-757,21 +822,21 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, if (CanRevCond) { // Try the other path... if (ValidTriangle(FalseBBI, TrueBBI, false, Dups) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize) && + MeetIfcvtSizeLimit(*FalseBBI.BB, FalseBBI.NonPredSize) && FeasibilityAnalysis(FalseBBI, RevCond, true)) { Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups)); Enqueued = true; } if (ValidTriangle(FalseBBI, TrueBBI, true, Dups) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize) && + MeetIfcvtSizeLimit(*FalseBBI.BB, FalseBBI.NonPredSize) && FeasibilityAnalysis(FalseBBI, RevCond, true, true)) { Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups)); Enqueued = true; } if (ValidSimple(FalseBBI, Dups) && - MeetIfcvtSizeLimit(FalseBBI.NonPredSize) && + MeetIfcvtSizeLimit(*FalseBBI.BB, FalseBBI.NonPredSize) && FeasibilityAnalysis(FalseBBI, RevCond)) { Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups)); Enqueued = true; @@ -785,11 +850,9 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB, } /// AnalyzeBlocks - Analyze all blocks and find entries for all if-conversion -/// candidates. It returns true if any CFG restructuring is done to expose more -/// if-conversion opportunities. -bool IfConverter::AnalyzeBlocks(MachineFunction &MF, +/// candidates. +void IfConverter::AnalyzeBlocks(MachineFunction &MF, std::vector &Tokens) { - bool Change = false; std::set Visited; for (unsigned i = 0, e = Roots.size(); i != e; ++i) { for (idf_ext_iterator I=idf_ext_begin(Roots[i],Visited), @@ -801,20 +864,23 @@ bool IfConverter::AnalyzeBlocks(MachineFunction &MF, // Sort to favor more complex ifcvt scheme. std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp); - - return Change; } /// canFallThroughTo - Returns true either if ToBB is the next block after BB or /// that all the intervening blocks are empty (given BB can fall through to its /// next block). static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) { - MachineFunction::iterator I = BB; + MachineFunction::iterator PI = BB; + MachineFunction::iterator I = llvm::next(PI); MachineFunction::iterator TI = ToBB; MachineFunction::iterator E = BB->getParent()->end(); - while (++I != TI) - if (I == E || !I->empty()) + while (I != TI) { + // Check isSuccessor to avoid case where the next block is empty, but + // it's not a successor. + if (I == E || !I->empty() || !PI->isSuccessor(I)) return false; + PI = I++; + } return true; } @@ -836,8 +902,9 @@ void IfConverter::InvalidatePreds(MachineBasicBlock *BB) { /// static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB, const TargetInstrInfo *TII) { + DebugLoc dl; // FIXME: this is nowhere SmallVector NoCond; - TII->InsertBranch(*BB, ToBB, NULL, NoCond); + TII->InsertBranch(*BB, ToBB, NULL, NoCond, dl); } /// RemoveExtraEdges - Remove true / false edges if either / both are no longer @@ -849,6 +916,66 @@ void IfConverter::RemoveExtraEdges(BBInfo &BBI) { BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); } +/// InitPredRedefs / UpdatePredRedefs - Defs by predicated instructions are +/// modeled as read + write (sort like two-address instructions). These +/// routines track register liveness and add implicit uses to if-converted +/// instructions to conform to the model. 
+static void InitPredRedefs(MachineBasicBlock *BB, SmallSet &Redefs, + const TargetRegisterInfo *TRI) { + for (MachineBasicBlock::livein_iterator I = BB->livein_begin(), + E = BB->livein_end(); I != E; ++I) { + unsigned Reg = *I; + Redefs.insert(Reg); + for (const unsigned *Subreg = TRI->getSubRegisters(Reg); + *Subreg; ++Subreg) + Redefs.insert(*Subreg); + } +} + +static void UpdatePredRedefs(MachineInstr *MI, SmallSet &Redefs, + const TargetRegisterInfo *TRI, + bool AddImpUse = false) { + SmallVector Defs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isDef()) + Defs.push_back(Reg); + else if (MO.isKill()) { + Redefs.erase(Reg); + for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) + Redefs.erase(*SR); + } + } + for (unsigned i = 0, e = Defs.size(); i != e; ++i) { + unsigned Reg = Defs[i]; + if (Redefs.count(Reg)) { + if (AddImpUse) + // Treat predicated update as read + write. + MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/, + true/*IsImp*/,false/*IsKill*/)); + } else { + Redefs.insert(Reg); + for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR) + Redefs.insert(*SR); + } + } +} + +static void UpdatePredRedefs(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator E, + SmallSet &Redefs, + const TargetRegisterInfo *TRI) { + while (I != E) { + UpdatePredRedefs(I, Redefs, TRI); + ++I; + } +} + /// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG. /// bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { @@ -873,13 +1000,19 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) { if (TII->ReverseBranchCondition(Cond)) assert(false && "Unable to reverse branch condition!"); + // Initialize liveins to the first BB. These are potentiall redefined by + // predicated instructions. + SmallSet Redefs; + InitPredRedefs(CvtBBI->BB, Redefs, TRI); + InitPredRedefs(NextBBI->BB, Redefs, TRI); + if (CvtBBI->BB->pred_size() > 1) { BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. - CopyAndPredicateBlock(BBI, *CvtBBI, Cond); + CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs); } else { - PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond); + PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs); // Merge converted block into entry block. BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); @@ -922,6 +1055,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()]; BBInfo *CvtBBI = &TrueBBI; BBInfo *NextBBI = &FalseBBI; + DebugLoc dl; // FIXME: this is nowhere SmallVector Cond(BBI.BrCond.begin(), BBI.BrCond.end()); if (Kind == ICTriangleFalse || Kind == ICTriangleFRev) @@ -957,21 +1091,26 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { } } + // Initialize liveins to the first BB. These are potentially redefined by + // predicated instructions. + SmallSet Redefs; + InitPredRedefs(CvtBBI->BB, Redefs, TRI); + InitPredRedefs(NextBBI->BB, Redefs, TRI); + bool HasEarlyExit = CvtBBI->FalseBB != NULL; - bool DupBB = CvtBBI->BB->pred_size() > 1; - if (DupBB) { + if (CvtBBI->BB->pred_size() > 1) { BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. 
- CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); + CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB); - PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond); + PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs); // Now merge the entry of the triangle with the true block. BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); - MergeBlocks(BBI, *CvtBBI); + MergeBlocks(BBI, *CvtBBI, false); } // If 'true' block has a 'false' successor, add an exit branch to it. @@ -980,7 +1119,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) assert(false && "Unable to reverse branch condition!"); - TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond); + TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond, dl); BBI.BB->addSuccessor(CvtBBI->FalseBB); } @@ -1009,7 +1148,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { RemoveExtraEdges(BBI); // Update block info. BB can be iteratively if-converted. - if (!IterIfcvt) + if (!IterIfcvt) BBI.IsDone = true; InvalidatePreds(BBI.BB); CvtBBI->IsDone = true; @@ -1044,9 +1183,9 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, return false; } - // Merge the 'true' and 'false' blocks by copying the instructions - // from the 'false' block to the 'true' block. That is, unless the true - // block would clobber the predicate, in that case, do the opposite. + // Put the predicated instructions from the 'true' block before the + // instructions from the 'false' block, unless the true block would clobber + // the predicate, in which case, do the opposite. BBInfo *BBI1 = &TrueBBI; BBInfo *BBI2 = &FalseBBI; SmallVector RevCond(BBI.BrCond.begin(), BBI.BrCond.end()); @@ -1071,39 +1210,72 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, // Remove the conditional branch from entry to the blocks. BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); + // Initialize liveins to the first BB. These are potentially redefined by + // predicated instructions. + SmallSet Redefs; + InitPredRedefs(BBI1->BB, Redefs, TRI); + // Remove the duplicated instructions at the beginnings of both paths. MachineBasicBlock::iterator DI1 = BBI1->BB->begin(); MachineBasicBlock::iterator DI2 = BBI2->BB->begin(); + MachineBasicBlock::iterator DIE1 = BBI1->BB->end(); + MachineBasicBlock::iterator DIE2 = BBI2->BB->end(); + // Skip dbg_value instructions + while (DI1 != DIE1 && DI1->isDebugValue()) + ++DI1; + while (DI2 != DIE2 && DI2->isDebugValue()) + ++DI2; BBI1->NonPredSize -= NumDups1; BBI2->NonPredSize -= NumDups1; + + // Skip past the dups on each side separately since there may be + // differing dbg_value entries. + for (unsigned i = 0; i < NumDups1; ++DI1) { + if (!DI1->isDebugValue()) + ++i; + } while (NumDups1 != 0) { - ++DI1; ++DI2; - --NumDups1; + if (!DI2->isDebugValue()) + --NumDups1; } + + UpdatePredRedefs(BBI1->BB->begin(), DI1, Redefs, TRI); BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1); BBI2->BB->erase(BBI2->BB->begin(), DI2); // Predicate the 'true' block after removing its branch. BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB); DI1 = BBI1->BB->end(); - for (unsigned i = 0; i != NumDups2; ++i) + for (unsigned i = 0; i != NumDups2; ) { + // NumDups2 only counted non-dbg_value instructions, so this won't + // run off the head of the list. 
+ assert (DI1 != BBI1->BB->begin()); --DI1; + // skip dbg_value instructions + if (!DI1->isDebugValue()) + ++i; + } BBI1->BB->erase(DI1, BBI1->BB->end()); - PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1); + PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, Redefs); // Predicate the 'false' block. BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB); DI2 = BBI2->BB->end(); while (NumDups2 != 0) { + // NumDups2 only counted non-dbg_value instructions, so this won't + // run off the head of the list. + assert (DI2 != BBI2->BB->begin()); --DI2; - --NumDups2; + // skip dbg_value instructions + if (!DI2->isDebugValue()) + --NumDups2; } - PredicateBlock(*BBI2, DI2, *Cond2); + PredicateBlock(*BBI2, DI2, *Cond2, Redefs); // Merge the true block into the entry of the diamond. - MergeBlocks(BBI, *BBI1); - MergeBlocks(BBI, *BBI2); + MergeBlocks(BBI, *BBI1, TailBB == 0); + MergeBlocks(BBI, *BBI2, TailBB == 0); // If the if-converted block falls through or unconditionally branches into // the tail block, and the tail block does not have other predecessors, then @@ -1111,16 +1283,32 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, // tail, add a unconditional branch to it. if (TailBB) { BBInfo TailBBI = BBAnalysis[TailBB->getNumber()]; - if (TailBB->pred_size() == 1 && !TailBBI.HasFallThrough) { - BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB); + bool CanMergeTail = !TailBBI.HasFallThrough; + // There may still be a fall-through edge from BBI1 or BBI2 to TailBB; + // check if there are any other predecessors besides those. + unsigned NumPreds = TailBB->pred_size(); + if (NumPreds > 1) + CanMergeTail = false; + else if (NumPreds == 1 && CanMergeTail) { + MachineBasicBlock::pred_iterator PI = TailBB->pred_begin(); + if (*PI != BBI1->BB && *PI != BBI2->BB) + CanMergeTail = false; + } + if (CanMergeTail) { MergeBlocks(BBI, TailBBI); TailBBI.IsDone = true; } else { + BBI.BB->addSuccessor(TailBB); InsertUncondBranch(BBI.BB, TailBB, TII); BBI.HasFallThrough = false; } } + // RemoveExtraEdges won't work if the block has an unanalyzable branch, + // which can happen here if TailBB is unanalyzable and is merged, so + // explicitly remove BBI1 and BBI2 as successors. + BBI.BB->removeSuccessor(BBI1->BB); + BBI.BB->removeSuccessor(BBI2->BB); RemoveExtraEdges(BBI); // Update block info. @@ -1135,9 +1323,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, /// specified end with the specified condition. void IfConverter::PredicateBlock(BBInfo &BBI, MachineBasicBlock::iterator E, - SmallVectorImpl &Cond) { + SmallVectorImpl &Cond, + SmallSet &Redefs) { for (MachineBasicBlock::iterator I = BBI.BB->begin(); I != E; ++I) { - if (TII->isPredicated(I)) + if (I->isDebugValue() || TII->isPredicated(I)) continue; if (!TII->PredicateInstruction(I, Cond)) { #ifndef NDEBUG @@ -1145,6 +1334,10 @@ void IfConverter::PredicateBlock(BBInfo &BBI, #endif llvm_unreachable(0); } + + // If the predicated instruction now redefines a register as the result of + // if-conversion, add an implicit kill. + UpdatePredRedefs(I, Redefs, TRI, true); } std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate)); @@ -1152,48 +1345,55 @@ void IfConverter::PredicateBlock(BBInfo &BBI, BBI.IsAnalyzed = false; BBI.NonPredSize = 0; - NumIfConvBBs++; + ++NumIfConvBBs; } /// CopyAndPredicateBlock - Copy and predicate instructions from source BB to /// the destination block. Skip end of block branches if IgnoreBr is true. 
void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, SmallVectorImpl &Cond, + SmallSet &Redefs, bool IgnoreBr) { MachineFunction &MF = *ToBBI.BB->getParent(); for (MachineBasicBlock::iterator I = FromBBI.BB->begin(), E = FromBBI.BB->end(); I != E; ++I) { const TargetInstrDesc &TID = I->getDesc(); - bool isPredicated = TII->isPredicated(I); // Do not copy the end of the block branches. - if (IgnoreBr && !isPredicated && TID.isBranch()) + if (IgnoreBr && TID.isBranch()) break; MachineInstr *MI = MF.CloneMachineInstr(I); ToBBI.BB->insert(ToBBI.BB->end(), MI); ToBBI.NonPredSize++; - if (!isPredicated) + if (!TII->isPredicated(I) && !MI->isDebugValue()) { if (!TII->PredicateInstruction(MI, Cond)) { #ifndef NDEBUG dbgs() << "Unable to predicate " << *I << "!\n"; #endif llvm_unreachable(0); } + } + + // If the predicated instruction now redefines a register as the result of + // if-conversion, add an implicit kill. + UpdatePredRedefs(MI, Redefs, TRI, true); } - std::vector Succs(FromBBI.BB->succ_begin(), - FromBBI.BB->succ_end()); - MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); - MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL; + if (!IgnoreBr) { + std::vector Succs(FromBBI.BB->succ_begin(), + FromBBI.BB->succ_end()); + MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); + MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL; - for (unsigned i = 0, e = Succs.size(); i != e; ++i) { - MachineBasicBlock *Succ = Succs[i]; - // Fallthrough edge can't be transferred. - if (Succ == FallThrough) - continue; - ToBBI.BB->addSuccessor(Succ); + for (unsigned i = 0, e = Succs.size(); i != e; ++i) { + MachineBasicBlock *Succ = Succs[i]; + // Fallthrough edge can't be transferred. + if (Succ == FallThrough) + continue; + ToBBI.BB->addSuccessor(Succ); + } } std::copy(FromBBI.Predicate.begin(), FromBBI.Predicate.end(), @@ -1203,25 +1403,18 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, ToBBI.ClobbersPred |= FromBBI.ClobbersPred; ToBBI.IsAnalyzed = false; - NumDupBBs++; + ++NumDupBBs; } /// MergeBlocks - Move all instructions from FromBB to the end of ToBB. -/// -void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI) { +/// This will leave FromBB as an empty block, so remove all of its +/// successor edges except for the fall-through edge. If AddEdges is true, +/// i.e., when FromBBI's branch is being moved, add those successor edges to +/// ToBBI. +void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { ToBBI.BB->splice(ToBBI.BB->end(), FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end()); - // Redirect all branches to FromBB to ToBB. - std::vector Preds(FromBBI.BB->pred_begin(), - FromBBI.BB->pred_end()); - for (unsigned i = 0, e = Preds.size(); i != e; ++i) { - MachineBasicBlock *Pred = Preds[i]; - if (Pred == ToBBI.BB) - continue; - Pred->ReplaceUsesOfBlockWith(FromBBI.BB, ToBBI.BB); - } - std::vector Succs(FromBBI.BB->succ_begin(), FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); @@ -1233,7 +1426,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI) { if (Succ == FallThrough) continue; FromBBI.BB->removeSuccessor(Succ); - ToBBI.BB->addSuccessor(Succ); + if (AddEdges) + ToBBI.BB->addSuccessor(Succ); } // Now FromBBI always falls through to the next block! 
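The IfConversion changes above retire the flat TLI->getIfCvtBlockSizeLimit()/getIfCvtDupBlockSizeLimit() cutoffs in favor of per-block TargetInstrInfo queries (isProfitableToIfCvt, isProfitableToDupForIfCvt). The sketch below shows what a target override of those hooks could look like; MyTargetInstrInfo is a hypothetical class, and the signatures are inferred from the call sites in this patch rather than from the (unshown) TargetInstrInfo declaration, so treat them as assumptions.

#include "MyTargetInstrInfo.h" // hypothetical target header declaring the overrides
using namespace llvm;

// Hypothetical target hooks: accept a candidate block (or diamond) for
// if-conversion only when it is small enough to plausibly beat a branch.
bool MyTargetInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
                                            unsigned NumInstrs) const {
  // Single-block (simple/triangle) case: cap the predicated block size.
  return NumInstrs > 0 && NumInstrs <= 4;
}

bool MyTargetInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TBB,
                                            unsigned TSize,
                                            MachineBasicBlock &FBB,
                                            unsigned FSize) const {
  // Diamond case: both sides get predicated, so bound the combined size.
  return TSize > 0 && FSize > 0 && TSize + FSize <= 6;
}

bool MyTargetInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
                                                  unsigned NumInstrs) const {
  // Duplication multiplies code size; be stricter than plain predication.
  return NumInstrs <= 2;
}

A target with real scheduling information would typically weigh mispredict cost against the predicated instruction count instead of the fixed thresholds used in this sketch.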
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp new file mode 100644 index 000000000000..12adcaa3a22e --- /dev/null +++ b/lib/CodeGen/InlineSpiller.cpp @@ -0,0 +1,408 @@ +//===-------- InlineSpiller.cpp - Insert spills and restores inline -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The inline spiller modifies the machine function directly instead of +// inserting spills and restores in VirtRegMap. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "spiller" +#include "Spiller.h" +#include "VirtRegMap.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class InlineSpiller : public Spiller { + MachineFunction &mf_; + LiveIntervals &lis_; + VirtRegMap &vrm_; + MachineFrameInfo &mfi_; + MachineRegisterInfo &mri_; + const TargetInstrInfo &tii_; + const TargetRegisterInfo &tri_; + const BitVector reserved_; + + // Variables that are valid during spill(), but used by multiple methods. + LiveInterval *li_; + std::vector *newIntervals_; + const TargetRegisterClass *rc_; + int stackSlot_; + const SmallVectorImpl *spillIs_; + + // Values of the current interval that can potentially remat. + SmallPtrSet reMattable_; + + // Values in reMattable_ that failed to remat at some point. + SmallPtrSet usedValues_; + + ~InlineSpiller() {} + +public: + InlineSpiller(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) + : mf_(*mf), lis_(*lis), vrm_(*vrm), + mfi_(*mf->getFrameInfo()), + mri_(mf->getRegInfo()), + tii_(*mf->getTarget().getInstrInfo()), + tri_(*mf->getTarget().getRegisterInfo()), + reserved_(tri_.getReservedRegs(mf_)) {} + + void spill(LiveInterval *li, + std::vector &newIntervals, + SmallVectorImpl &spillIs, + SlotIndex *earliestIndex); + +private: + bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, + SlotIndex UseIdx); + bool reMaterializeFor(MachineBasicBlock::iterator MI); + void reMaterializeAll(); + + bool foldMemoryOperand(MachineBasicBlock::iterator MI, + const SmallVectorImpl &Ops); + void insertReload(LiveInterval &NewLI, MachineBasicBlock::iterator MI); + void insertSpill(LiveInterval &NewLI, MachineBasicBlock::iterator MI); +}; +} + +namespace llvm { +Spiller *createInlineSpiller(MachineFunction *mf, + LiveIntervals *lis, + const MachineLoopInfo *mli, + VirtRegMap *vrm) { + return new InlineSpiller(mf, lis, vrm); +} +} + +/// allUsesAvailableAt - Return true if all registers used by OrigMI at +/// OrigIdx are also available with the same value at UseIdx. +bool InlineSpiller::allUsesAvailableAt(const MachineInstr *OrigMI, + SlotIndex OrigIdx, + SlotIndex UseIdx) { + OrigIdx = OrigIdx.getUseIndex(); + UseIdx = UseIdx.getUseIndex(); + for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = OrigMI->getOperand(i); + if (!MO.isReg() || !MO.getReg() || MO.getReg() == li_->reg) + continue; + // Reserved registers are OK. 
+ if (MO.isUndef() || !lis_.hasInterval(MO.getReg())) + continue; + // We don't want to move any defs. + if (MO.isDef()) + return false; + // We cannot depend on virtual registers in spillIs_. They will be spilled. + for (unsigned si = 0, se = spillIs_->size(); si != se; ++si) + if ((*spillIs_)[si]->reg == MO.getReg()) + return false; + + LiveInterval &LI = lis_.getInterval(MO.getReg()); + const VNInfo *OVNI = LI.getVNInfoAt(OrigIdx); + if (!OVNI) + continue; + if (OVNI != LI.getVNInfoAt(UseIdx)) + return false; + } + return true; +} + +/// reMaterializeFor - Attempt to rematerialize li_->reg before MI instead of +/// reloading it. +bool InlineSpiller::reMaterializeFor(MachineBasicBlock::iterator MI) { + SlotIndex UseIdx = lis_.getInstructionIndex(MI).getUseIndex(); + VNInfo *OrigVNI = li_->getVNInfoAt(UseIdx); + if (!OrigVNI) { + DEBUG(dbgs() << "\tadding flags: "); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isUse() && MO.getReg() == li_->reg) + MO.setIsUndef(); + } + DEBUG(dbgs() << UseIdx << '\t' << *MI); + return true; + } + if (!reMattable_.count(OrigVNI)) { + DEBUG(dbgs() << "\tusing non-remat valno " << OrigVNI->id << ": " + << UseIdx << '\t' << *MI); + return false; + } + MachineInstr *OrigMI = lis_.getInstructionFromIndex(OrigVNI->def); + if (!allUsesAvailableAt(OrigMI, OrigVNI->def, UseIdx)) { + usedValues_.insert(OrigVNI); + DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << *MI); + return false; + } + + // If the instruction also writes li_->reg, it had better not require the same + // register for uses and defs. + bool Reads, Writes; + SmallVector Ops; + tie(Reads, Writes) = MI->readsWritesVirtualRegister(li_->reg, &Ops); + if (Writes) { + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(Ops[i]); + if (MO.isUse() ? MI->isRegTiedToDefOperand(Ops[i]) : MO.getSubReg()) { + usedValues_.insert(OrigVNI); + DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI); + return false; + } + } + } + + // Alocate a new register for the remat. + unsigned NewVReg = mri_.createVirtualRegister(rc_); + vrm_.grow(); + LiveInterval &NewLI = lis_.getOrCreateInterval(NewVReg); + NewLI.markNotSpillable(); + newIntervals_->push_back(&NewLI); + + // Finally we can rematerialize OrigMI before MI. + MachineBasicBlock &MBB = *MI->getParent(); + tii_.reMaterialize(MBB, MI, NewLI.reg, 0, OrigMI, tri_); + MachineBasicBlock::iterator RematMI = MI; + SlotIndex DefIdx = lis_.InsertMachineInstrInMaps(--RematMI).getDefIndex(); + DEBUG(dbgs() << "\tremat: " << DefIdx << '\t' << *RematMI); + + // Replace operands + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(Ops[i]); + if (MO.isReg() && MO.isUse() && MO.getReg() == li_->reg) { + MO.setReg(NewVReg); + MO.setIsKill(); + } + } + DEBUG(dbgs() << "\t " << UseIdx << '\t' << *MI); + + VNInfo *DefVNI = NewLI.getNextValue(DefIdx, 0, true, + lis_.getVNInfoAllocator()); + NewLI.addRange(LiveRange(DefIdx, UseIdx.getDefIndex(), DefVNI)); + DEBUG(dbgs() << "\tinterval: " << NewLI << '\n'); + return true; +} + +/// reMaterializeAll - Try to rematerialize as many uses of li_ as possible, +/// and trim the live ranges after. +void InlineSpiller::reMaterializeAll() { + // Do a quick scan of the interval values to find if any are remattable. 
+ reMattable_.clear(); + usedValues_.clear(); + for (LiveInterval::const_vni_iterator I = li_->vni_begin(), + E = li_->vni_end(); I != E; ++I) { + VNInfo *VNI = *I; + if (VNI->isUnused() || !VNI->isDefAccurate()) + continue; + MachineInstr *DefMI = lis_.getInstructionFromIndex(VNI->def); + if (!DefMI || !tii_.isTriviallyReMaterializable(DefMI)) + continue; + reMattable_.insert(VNI); + } + + // Often, no defs are remattable. + if (reMattable_.empty()) + return; + + // Try to remat before all uses of li_->reg. + bool anyRemat = false; + for (MachineRegisterInfo::use_nodbg_iterator + RI = mri_.use_nodbg_begin(li_->reg); + MachineInstr *MI = RI.skipInstruction();) + anyRemat |= reMaterializeFor(MI); + + if (!anyRemat) + return; + + // Remove any values that were completely rematted. + bool anyRemoved = false; + for (SmallPtrSet::iterator I = reMattable_.begin(), + E = reMattable_.end(); I != E; ++I) { + VNInfo *VNI = *I; + if (VNI->hasPHIKill() || usedValues_.count(VNI)) + continue; + MachineInstr *DefMI = lis_.getInstructionFromIndex(VNI->def); + DEBUG(dbgs() << "\tremoving dead def: " << VNI->def << '\t' << *DefMI); + lis_.RemoveMachineInstrFromMaps(DefMI); + vrm_.RemoveMachineInstrFromMaps(DefMI); + DefMI->eraseFromParent(); + li_->removeValNo(VNI); + anyRemoved = true; + } + + if (!anyRemoved) + return; + + // Removing values may cause debug uses where li_ is not live. + for (MachineRegisterInfo::use_iterator RI = mri_.use_begin(li_->reg); + MachineInstr *MI = RI.skipInstruction();) { + if (!MI->isDebugValue()) + continue; + // Try to preserve the debug value if li_ is live immediately after it. + MachineBasicBlock::iterator NextMI = MI; + ++NextMI; + if (NextMI != MI->getParent()->end() && !lis_.isNotInMIMap(NextMI)) { + SlotIndex NearIdx = lis_.getInstructionIndex(NextMI); + if (li_->liveAt(NearIdx)) + continue; + } + DEBUG(dbgs() << "Removing debug info due to remat:" << "\t" << *MI); + MI->eraseFromParent(); + } +} + +/// foldMemoryOperand - Try folding stack slot references in Ops into MI. +/// Return true on success, and MI will be erased. +bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI, + const SmallVectorImpl &Ops) { + // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied + // operands. + SmallVector FoldOps; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + unsigned Idx = Ops[i]; + MachineOperand &MO = MI->getOperand(Idx); + if (MO.isImplicit()) + continue; + // FIXME: Teach targets to deal with subregs. + if (MO.getSubReg()) + return false; + // Tied use operands should not be passed to foldMemoryOperand. + if (!MI->isRegTiedToDefOperand(Idx)) + FoldOps.push_back(Idx); + } + + MachineInstr *FoldMI = tii_.foldMemoryOperand(MI, FoldOps, stackSlot_); + if (!FoldMI) + return false; + lis_.ReplaceMachineInstrInMaps(MI, FoldMI); + vrm_.addSpillSlotUse(stackSlot_, FoldMI); + MI->eraseFromParent(); + DEBUG(dbgs() << "\tfolded: " << *FoldMI); + return true; +} + +/// insertReload - Insert a reload of NewLI.reg before MI. +void InlineSpiller::insertReload(LiveInterval &NewLI, + MachineBasicBlock::iterator MI) { + MachineBasicBlock &MBB = *MI->getParent(); + SlotIndex Idx = lis_.getInstructionIndex(MI).getDefIndex(); + tii_.loadRegFromStackSlot(MBB, MI, NewLI.reg, stackSlot_, rc_, &tri_); + --MI; // Point to load instruction. 
+ SlotIndex LoadIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex(); + vrm_.addSpillSlotUse(stackSlot_, MI); + DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI); + VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, 0, true, + lis_.getVNInfoAllocator()); + NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI)); +} + +/// insertSpill - Insert a spill of NewLI.reg after MI. +void InlineSpiller::insertSpill(LiveInterval &NewLI, + MachineBasicBlock::iterator MI) { + MachineBasicBlock &MBB = *MI->getParent(); + SlotIndex Idx = lis_.getInstructionIndex(MI).getDefIndex(); + tii_.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, stackSlot_, rc_, &tri_); + --MI; // Point to store instruction. + SlotIndex StoreIdx = lis_.InsertMachineInstrInMaps(MI).getDefIndex(); + vrm_.addSpillSlotUse(stackSlot_, MI); + DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI); + VNInfo *StoreVNI = NewLI.getNextValue(Idx, 0, true, + lis_.getVNInfoAllocator()); + NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI)); +} + +void InlineSpiller::spill(LiveInterval *li, + std::vector &newIntervals, + SmallVectorImpl &spillIs, + SlotIndex *earliestIndex) { + DEBUG(dbgs() << "Inline spilling " << *li << "\n"); + assert(li->isSpillable() && "Attempting to spill already spilled value."); + assert(!li->isStackSlot() && "Trying to spill a stack slot."); + + li_ = li; + newIntervals_ = &newIntervals; + rc_ = mri_.getRegClass(li->reg); + spillIs_ = &spillIs; + + reMaterializeAll(); + + // Remat may handle everything. + if (li_->empty()) + return; + + stackSlot_ = vrm_.assignVirt2StackSlot(li->reg); + + // Iterate over instructions using register. + for (MachineRegisterInfo::reg_iterator RI = mri_.reg_begin(li->reg); + MachineInstr *MI = RI.skipInstruction();) { + + // Debug values are not allowed to affect codegen. + if (MI->isDebugValue()) { + // Modify DBG_VALUE now that the value is in a spill slot. + uint64_t Offset = MI->getOperand(1).getImm(); + const MDNode *MDPtr = MI->getOperand(2).getMetadata(); + DebugLoc DL = MI->getDebugLoc(); + if (MachineInstr *NewDV = tii_.emitFrameIndexDebugValue(mf_, stackSlot_, + Offset, MDPtr, DL)) { + DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MBB->insert(MBB->erase(MI), NewDV); + } else { + DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI); + MI->eraseFromParent(); + } + continue; + } + + // Analyze instruction. + bool Reads, Writes; + SmallVector Ops; + tie(Reads, Writes) = MI->readsWritesVirtualRegister(li->reg, &Ops); + + // Attempt to fold memory ops. + if (foldMemoryOperand(MI, Ops)) + continue; + + // Allocate interval around instruction. + // FIXME: Infer regclass from instruction alone. + unsigned NewVReg = mri_.createVirtualRegister(rc_); + vrm_.grow(); + LiveInterval &NewLI = lis_.getOrCreateInterval(NewVReg); + NewLI.markNotSpillable(); + + if (Reads) + insertReload(NewLI, MI); + + // Rewrite instruction operands. + bool hasLiveDef = false; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(Ops[i]); + MO.setReg(NewVReg); + if (MO.isUse()) { + if (!MI->isRegTiedToDefOperand(Ops[i])) + MO.setIsKill(); + } else { + if (!MO.isDead()) + hasLiveDef = true; + } + } + + // FIXME: Use a second vreg if instruction has no tied ops. 
+ if (Writes && hasLiveDef) + insertSpill(NewLI, MI); + + DEBUG(dbgs() << "\tinterval: " << NewLI << '\n'); + newIntervals.push_back(&NewLI); + } +} diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 63bb5f21f8f1..03ae214ae7da 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -16,6 +16,7 @@ #include "llvm/Module.h" #include "llvm/Type.h" #include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" @@ -314,21 +315,22 @@ static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) { static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname, const char *Dname, const char *LDname) { - switch (CI->getOperand(1)->getType()->getTypeID()) { + CallSite CS(CI); + switch (CI->getArgOperand(0)->getType()->getTypeID()) { default: llvm_unreachable("Invalid type in intrinsic"); case Type::FloatTyID: - ReplaceCallWith(Fname, CI, CI->op_begin() + 1, CI->op_end(), + ReplaceCallWith(Fname, CI, CS.arg_begin(), CS.arg_end(), Type::getFloatTy(CI->getContext())); break; case Type::DoubleTyID: - ReplaceCallWith(Dname, CI, CI->op_begin() + 1, CI->op_end(), + ReplaceCallWith(Dname, CI, CS.arg_begin(), CS.arg_end(), Type::getDoubleTy(CI->getContext())); break; case Type::X86_FP80TyID: case Type::FP128TyID: case Type::PPC_FP128TyID: - ReplaceCallWith(LDname, CI, CI->op_begin() + 1, CI->op_end(), - CI->getOperand(1)->getType()); + ReplaceCallWith(LDname, CI, CS.arg_begin(), CS.arg_end(), + CI->getArgOperand(0)->getType()); break; } } @@ -340,6 +342,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { const Function *Callee = CI->getCalledFunction(); assert(Callee && "Cannot lower an indirect call!"); + CallSite CS(CI); switch (Callee->getIntrinsicID()) { case Intrinsic::not_intrinsic: report_fatal_error("Cannot lower a call to a non-intrinsic function '"+ @@ -353,7 +356,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { // by the lowerinvoke pass. In both cases, the right thing to do is to // convert the call to an explicit setjmp or longjmp call. 
case Intrinsic::setjmp: { - Value *V = ReplaceCallWith("setjmp", CI, CI->op_begin() + 1, CI->op_end(), + Value *V = ReplaceCallWith("setjmp", CI, CS.arg_begin(), CS.arg_end(), Type::getInt32Ty(Context)); if (!CI->getType()->isVoidTy()) CI->replaceAllUsesWith(V); @@ -365,32 +368,32 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; case Intrinsic::longjmp: { - ReplaceCallWith("longjmp", CI, CI->op_begin() + 1, CI->op_end(), + ReplaceCallWith("longjmp", CI, CS.arg_begin(), CS.arg_end(), Type::getVoidTy(Context)); break; } case Intrinsic::siglongjmp: { // Insert the call to abort - ReplaceCallWith("abort", CI, CI->op_end(), CI->op_end(), + ReplaceCallWith("abort", CI, CS.arg_end(), CS.arg_end(), Type::getVoidTy(Context)); break; } case Intrinsic::ctpop: - CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getOperand(1), CI)); + CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getArgOperand(0), CI)); break; case Intrinsic::bswap: - CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getOperand(1), CI)); + CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getArgOperand(0), CI)); break; case Intrinsic::ctlz: - CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getOperand(1), CI)); + CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI)); break; case Intrinsic::cttz: { // cttz(x) -> ctpop(~X & (X-1)) - Value *Src = CI->getOperand(1); + Value *Src = CI->getArgOperand(0); Value *NotSrc = Builder.CreateNot(Src); NotSrc->setName(Src->getName() + ".not"); Value *SrcM1 = ConstantInt::get(Src->getType(), 1); @@ -451,37 +454,37 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { case Intrinsic::memcpy: { const IntegerType *IntPtr = TD.getIntPtrType(Context); - Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr, + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; - Ops[0] = CI->getOperand(1); - Ops[1] = CI->getOperand(2); + Ops[0] = CI->getArgOperand(0); + Ops[1] = CI->getArgOperand(1); Ops[2] = Size; - ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getOperand(1)->getType()); + ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); break; } case Intrinsic::memmove: { const IntegerType *IntPtr = TD.getIntPtrType(Context); - Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr, + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; - Ops[0] = CI->getOperand(1); - Ops[1] = CI->getOperand(2); + Ops[0] = CI->getArgOperand(0); + Ops[1] = CI->getArgOperand(1); Ops[2] = Size; - ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getOperand(1)->getType()); + ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); break; } case Intrinsic::memset: { const IntegerType *IntPtr = TD.getIntPtrType(Context); - Value *Size = Builder.CreateIntCast(CI->getOperand(3), IntPtr, + Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; - Ops[0] = CI->getOperand(1); + Ops[0] = CI->getArgOperand(0); // Extend the amount to i32. 
- Ops[1] = Builder.CreateIntCast(CI->getOperand(2), Type::getInt32Ty(Context), + Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1), Type::getInt32Ty(Context), /* isSigned */ false); Ops[2] = Size; - ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getOperand(1)->getType()); + ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getArgOperand(0)->getType()); break; } case Intrinsic::sqrt: { diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index b584704bff3d..bf3137e49536 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -329,12 +329,15 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, if (OptLevel != CodeGenOpt::None) PM.add(createOptimizePHIsPass()); - // Delete dead machine instructions regardless of optimization level. - PM.add(createDeadMachineInstructionElimPass()); - printAndVerify(PM, "After codegen DCE pass", - /* allowDoubleDefs= */ true); - if (OptLevel != CodeGenOpt::None) { + // With optimization, dead code should already be eliminated. However + // there is one known exception: lowered code for arguments that are only + // used by tail calls, where the tail calls reuse the incoming stack + // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). + PM.add(createDeadMachineInstructionElimPass()); + printAndVerify(PM, "After codegen DCE pass", + /* allowDoubleDefs= */ true); + PM.add(createOptimizeExtsPass()); if (!DisableMachineLICM) PM.add(createMachineLICMPass()); @@ -358,7 +361,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, /* allowDoubleDefs= */ true); // Perform register allocation. - PM.add(createRegisterAllocator()); + PM.add(createRegisterAllocator(OptLevel)); printAndVerify(PM, "After Register Allocation"); // Perform stack slot coloring and post-ra machine LICM. diff --git a/lib/CodeGen/LatencyPriorityQueue.cpp b/lib/CodeGen/LatencyPriorityQueue.cpp index 03b4eab93dca..b9527fafbee8 100644 --- a/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/lib/CodeGen/LatencyPriorityQueue.cpp @@ -118,7 +118,7 @@ void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) { SUnit *LatencyPriorityQueue::pop() { if (empty()) return NULL; std::vector::iterator Best = Queue.begin(); - for (std::vector::iterator I = next(Queue.begin()), + for (std::vector::iterator I = llvm::next(Queue.begin()), E = Queue.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 025ad0538f2c..21a9b7d4db6f 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -68,6 +68,37 @@ bool LiveInterval::liveBeforeAndAt(SlotIndex I) const { return r->end == I; } +/// killedAt - Return true if a live range ends at index. Note that the kill +/// point is not contained in the half-open live range. It is usually the +/// getDefIndex() slot following its last use. +bool LiveInterval::killedAt(SlotIndex I) const { + Ranges::const_iterator r = std::lower_bound(ranges.begin(), ranges.end(), I); + + // Now r points to the first interval with start >= I, or ranges.end(). + if (r == ranges.begin()) + return false; + + --r; + // Now r points to the last interval with end <= I. + // r->end is the kill point. + return r->end == I; +} + +/// killedInRange - Return true if the interval has kills in [Start,End). 
+bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const { + Ranges::const_iterator r = + std::lower_bound(ranges.begin(), ranges.end(), End); + + // Now r points to the first interval with start >= End, or ranges.end(). + if (r == ranges.begin()) + return false; + + --r; + // Now r points to the last interval with end <= End. + // r->end is the kill point. + return r->end >= Start && r->end < End; +} + // overlaps - Return true if the intersection of the two live intervals is // not empty. // @@ -149,7 +180,6 @@ bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const { void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { assert(I != ranges.end() && "Not a valid interval!"); VNInfo *ValNo = I->valno; - SlotIndex OldEnd = I->end; // Search for the first interval that we can't merge with. Ranges::iterator MergeTo = next(I); @@ -163,9 +193,6 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { // Erase any dead ranges. ranges.erase(next(I), MergeTo); - // Update kill info. - ValNo->removeKills(OldEnd, I->end.getPrevSlot()); - // If the newly formed range now touches the range after it and if they have // the same value number, merge the two ranges into one range. Ranges::iterator Next = next(I); @@ -245,9 +272,6 @@ LiveInterval::addRangeFrom(LiveRange LR, iterator From) { // endpoint as well. if (End > it->end) extendIntervalEndTo(it, End); - else if (End < it->end) - // Overlapping intervals, there might have been a kill here. - it->valno->removeKill(End); return it; } } else { @@ -288,7 +312,6 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, VNInfo *ValNo = I->valno; if (I->start == Start) { if (I->end == End) { - ValNo->removeKills(Start, End); if (RemoveDeadValNo) { // Check if val# is dead. bool isDead = true; @@ -296,7 +319,7 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, if (II != I && II->valno == ValNo) { isDead = false; break; - } + } if (isDead) { // Now that ValNo is dead, remove it. If it is the largest value // number, just nuke it (and any other deleted values neighboring it), @@ -320,7 +343,6 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End, // Otherwise if the span we are removing is at the end of the LiveRange, // adjust the other way. if (I->end == End) { - ValNo->removeKills(Start, End); I->end = Start; return; } @@ -529,6 +551,7 @@ void LiveInterval::MergeValueInAsValue( SmallVector ReplacedValNos; iterator IP = begin(); for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) { + assert(I->valno == RHS.getValNumInfo(I->valno->id) && "Bad VNInfo"); if (I->valno != RHSValNo) continue; SlotIndex Start = I->start, End = I->end; @@ -823,10 +846,12 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { else { OS << " = "; for (LiveInterval::Ranges::const_iterator I = ranges.begin(), - E = ranges.end(); I != E; ++I) - OS << *I; + E = ranges.end(); I != E; ++I) { + OS << *I; + assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo"); + } } - + // Print value number info. 
if (getNumValNums()) { OS << " "; @@ -843,21 +868,6 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const { OS << "?"; else OS << vni->def; - unsigned ee = vni->kills.size(); - if (ee || vni->hasPHIKill()) { - OS << "-("; - for (unsigned j = 0; j != ee; ++j) { - OS << vni->kills[j]; - if (j != ee-1) - OS << " "; - } - if (vni->hasPHIKill()) { - if (ee) - OS << " "; - OS << "phi"; - } - OS << ")"; - } } } } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index a6d38adeab04..194d03d8dbfb 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -50,9 +50,6 @@ using namespace llvm; static cl::opt DisableReMat("disable-rematerialization", cl::init(false), cl::Hidden); -static cl::opt EnableFastSpilling("fast-spill", - cl::init(false), cl::Hidden); - STATISTIC(numIntervals , "Number of original intervals"); STATISTIC(numFolds , "Number of loads/stores folded into instructions"); STATISTIC(numSplits , "Number of intervals split"); @@ -90,8 +87,8 @@ void LiveIntervals::releaseMemory() { r2iMap_.clear(); - // Release VNInfo memroy regions after all VNInfo objects are dtor'd. - VNInfoAllocator.DestroyAll(); + // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. + VNInfoAllocator.Reset(); while (!CloneMIs.empty()) { MachineInstr *MI = CloneMIs.back(); CloneMIs.pop_back(); @@ -195,6 +192,10 @@ bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li, if (tii_->isMoveInstr(MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) if (SrcReg == li.reg || DstReg == li.reg) continue; + if (MI.isCopy()) + if (MI.getOperand(0).getReg() == li.reg || + MI.getOperand(1).getReg() == li.reg) + continue; // Check for operands using reg for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { @@ -218,10 +219,7 @@ bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li, return false; } -/// conflictsWithSubPhysRegRef - Similar to conflictsWithPhysRegRef except -/// it checks for sub-register reference and it can check use as well. 
-bool LiveIntervals::conflictsWithSubPhysRegRef(LiveInterval &li, - unsigned Reg, bool CheckUse, +bool LiveIntervals::conflictsWithAliasRef(LiveInterval &li, unsigned Reg, SmallPtrSet &JoinedCopies) { for (LiveInterval::Ranges::const_iterator I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) { @@ -239,12 +237,11 @@ bool LiveIntervals::conflictsWithSubPhysRegRef(LiveInterval &li, MachineOperand& MO = MI->getOperand(i); if (!MO.isReg()) continue; - if (MO.isUse() && !CheckUse) - continue; unsigned PhysReg = MO.getReg(); - if (PhysReg == 0 || TargetRegisterInfo::isVirtualRegister(PhysReg)) + if (PhysReg == 0 || PhysReg == Reg || + TargetRegisterInfo::isVirtualRegister(PhysReg)) continue; - if (tri_->isSubRegister(Reg, PhysReg)) + if (tri_->regsOverlap(Reg, PhysReg)) return true; } } @@ -272,7 +269,7 @@ bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) { if (MO.getReg() == Reg && MO.isDef()) { assert(MI.getOperand(MOIdx).getSubReg() != MO.getSubReg() && MI.getOperand(MOIdx).getSubReg() && - MO.getSubReg()); + (MO.getSubReg() || MO.isImplicit())); return true; } } @@ -328,9 +325,10 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, MachineInstr *CopyMI = NULL; unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (mi->isExtractSubreg() || mi->isInsertSubreg() || mi->isSubregToReg() || - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (mi->isCopyLike() || + tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) { CopyMI = mi; + } VNInfo *ValNo = interval.getNextValue(defIndex, CopyMI, true, VNInfoAllocator); @@ -356,7 +354,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, LiveRange LR(defIndex, killIdx, ValNo); interval.addRange(LR); DEBUG(dbgs() << " +" << LR << "\n"); - ValNo->addKill(killIdx); return; } } @@ -376,7 +373,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // valno in the killing blocks. assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks"); DEBUG(dbgs() << " phi-join"); - ValNo->addKill(indexes_->getTerminatorGap(mbb)); ValNo->setHasPHIKill(true); } else { // Iterate over all of the blocks that the variable is completely @@ -407,7 +403,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, } LiveRange LR(Start, killIdx, ValNo); interval.addRange(LR); - ValNo->addKill(killIdx); DEBUG(dbgs() << " +" << LR); } @@ -434,11 +429,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // are actually two values in the live interval. Because of this we // need to take the LiveRegion that defines this register and split it // into two values. - // Two-address vregs should always only be redefined once. This means - // that at this point, there should be exactly one value number in it. - assert((PartReDef || interval.containsOneValue()) && - "Unexpected 2-addr liveint!"); - SlotIndex DefIndex = interval.getValNumInfo(0)->def.getDefIndex(); SlotIndex RedefIndex = MIIdx.getDefIndex(); if (MO.isEarlyClobber()) RedefIndex = MIIdx.getUseIndex(); @@ -446,8 +436,9 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, const LiveRange *OldLR = interval.getLiveRangeContaining(RedefIndex.getUseIndex()); VNInfo *OldValNo = OldLR->valno; + SlotIndex DefIndex = OldValNo->def.getDefIndex(); - // Delete the initial value, which should be short and continuous, + // Delete the previous value, which should be short and continuous, // because the 2-addr copy must be in the same MBB as the redef. 
interval.removeRange(DefIndex, RedefIndex); @@ -464,15 +455,14 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // A re-def may be a copy. e.g. %reg1030:6 = VMOVD %reg1026, ... unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (PartReDef && - tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (PartReDef && (mi->isCopyLike() || + tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))) OldValNo->setCopy(&*mi); // Add the new live interval which replaces the range for the input copy. LiveRange LR(DefIndex, RedefIndex, ValNo); DEBUG(dbgs() << " replace range with " << LR); interval.addRange(LR); - ValNo->addKill(RedefIndex); // If this redefinition is dead, we need to add a dummy unit live // range covering the def slot. @@ -496,7 +486,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, VNInfo *ValNo; MachineInstr *CopyMI = NULL; unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (mi->isExtractSubreg() || mi->isInsertSubreg() || mi->isSubregToReg()|| + if (mi->isCopyLike() || tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg)) CopyMI = mi; ValNo = interval.getNextValue(defIndex, CopyMI, true, VNInfoAllocator); @@ -504,7 +494,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, SlotIndex killIndex = getMBBEndIdx(mbb); LiveRange LR(defIndex, killIndex, ValNo); interval.addRange(LR); - ValNo->addKill(indexes_->getTerminatorGap(mbb)); ValNo->setHasPHIKill(true); DEBUG(dbgs() << " phi-join +" << LR); } else { @@ -600,7 +589,6 @@ exit: ValNo->setHasRedefByEC(true); LiveRange LR(start, end, ValNo); interval.addRange(LR); - LR.valno->addKill(end); DEBUG(dbgs() << " +" << LR << '\n'); } @@ -615,7 +603,7 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, else if (allocatableRegs_[MO.getReg()]) { MachineInstr *CopyMI = NULL; unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (MI->isExtractSubreg() || MI->isInsertSubreg() || MI->isSubregToReg() || + if (MI->isCopyLike() || tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg)) CopyMI = MI; handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, @@ -701,7 +689,6 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB, LiveRange LR(start, end, vni); interval.addRange(LR); - LR.valno->addKill(end); DEBUG(dbgs() << " +" << LR << '\n'); } @@ -787,37 +774,6 @@ LiveInterval* LiveIntervals::dupInterval(LiveInterval *li) { return NewLI; } -/// getVNInfoSourceReg - Helper function that parses the specified VNInfo -/// copy field and returns the source register that defines it. -unsigned LiveIntervals::getVNInfoSourceReg(const VNInfo *VNI) const { - if (!VNI->getCopy()) - return 0; - - if (VNI->getCopy()->isExtractSubreg()) { - // If it's extracting out of a physical register, return the sub-register. - unsigned Reg = VNI->getCopy()->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - unsigned SrcSubReg = VNI->getCopy()->getOperand(2).getImm(); - unsigned DstSubReg = VNI->getCopy()->getOperand(0).getSubReg(); - if (SrcSubReg == DstSubReg) - // %reg1034:3 = EXTRACT_SUBREG %EDX, 3 - // reg1034 can still be coalesced to EDX. 
- return Reg; - assert(DstSubReg == 0); - Reg = tri_->getSubReg(Reg, VNI->getCopy()->getOperand(2).getImm()); - } - return Reg; - } else if (VNI->getCopy()->isInsertSubreg() || - VNI->getCopy()->isSubregToReg()) - return VNI->getCopy()->getOperand(2).getReg(); - - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(*VNI->getCopy(), SrcReg, DstReg, SrcSubReg, DstSubReg)) - return SrcReg; - llvm_unreachable("Unrecognized copy instruction!"); - return 0; -} - //===----------------------------------------------------------------------===// // Register allocator hooks. // @@ -991,22 +947,22 @@ bool LiveIntervals::tryFoldMemoryOperand(MachineInstr* &MI, if (DefMI && (MRInfo & VirtRegMap::isMod)) return false; - MachineInstr *fmi = isSS ? tii_->foldMemoryOperand(*mf_, MI, FoldOps, Slot) - : tii_->foldMemoryOperand(*mf_, MI, FoldOps, DefMI); + MachineInstr *fmi = isSS ? tii_->foldMemoryOperand(MI, FoldOps, Slot) + : tii_->foldMemoryOperand(MI, FoldOps, DefMI); if (fmi) { // Remember this instruction uses the spill slot. if (isSS) vrm.addSpillSlotUse(Slot, fmi); // Attempt to fold the memory reference into the instruction. If // we can do this, we don't need to insert spill code. - MachineBasicBlock &MBB = *MI->getParent(); if (isSS && !mf_->getFrameInfo()->isImmutableObjectIndex(Slot)) vrm.virtFolded(Reg, MI, fmi, (VirtRegMap::ModRef)MRInfo); vrm.transferSpillPts(MI, fmi); vrm.transferRestorePts(MI, fmi); vrm.transferEmergencySpills(MI, fmi); ReplaceMachineInstrInMaps(MI, fmi); - MI = MBB.insert(MBB.erase(MI), fmi); + MI->eraseFromParent(); + MI = fmi; ++numFolds; return true; } @@ -1098,7 +1054,6 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, if (!mop.isReg()) continue; unsigned Reg = mop.getReg(); - unsigned RegI = Reg; if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; if (Reg != li.reg) @@ -1140,26 +1095,8 @@ rewriteInstructionForSpills(const LiveInterval &li, const VNInfo *VNI, // // Keep track of whether we replace a use and/or def so that we can // create the spill interval with the appropriate range. - - HasUse = mop.isUse(); - HasDef = mop.isDef(); SmallVector Ops; - Ops.push_back(i); - for (unsigned j = i+1, e = MI->getNumOperands(); j != e; ++j) { - const MachineOperand &MOj = MI->getOperand(j); - if (!MOj.isReg()) - continue; - unsigned RegJ = MOj.getReg(); - if (RegJ == 0 || TargetRegisterInfo::isPhysicalRegister(RegJ)) - continue; - if (RegJ == RegI) { - Ops.push_back(j); - if (!MOj.isUndef()) { - HasUse |= MOj.isUse(); - HasDef |= MOj.isDef(); - } - } - } + tie(HasUse, HasDef) = MI->readsWritesVirtualRegister(Reg, &Ops); // Create a new virtual register for the spill interval. 
// Create the new register now so we can map the fold instruction @@ -1294,16 +1231,7 @@ bool LiveIntervals::anyKillInMBBAfterIdx(const LiveInterval &li, const VNInfo *VNI, MachineBasicBlock *MBB, SlotIndex Idx) const { - SlotIndex End = getMBBEndIdx(MBB); - for (unsigned j = 0, ee = VNI->kills.size(); j != ee; ++j) { - if (VNI->kills[j].isPHI()) - continue; - - SlotIndex KillIdx = VNI->kills[j]; - if (KillIdx > Idx && KillIdx <= End) - return true; - } - return false; + return li.killedInRange(Idx.getNextSlot(), getMBBEndIdx(MBB)); } /// RewriteInfo - Keep track of machine instrs that will be rewritten @@ -1312,10 +1240,7 @@ namespace { struct RewriteInfo { SlotIndex Index; MachineInstr *MI; - bool HasUse; - bool HasDef; - RewriteInfo(SlotIndex i, MachineInstr *mi, bool u, bool d) - : Index(i), MI(mi), HasUse(u), HasDef(d) {} + RewriteInfo(SlotIndex i, MachineInstr *mi) : Index(i), MI(mi) {} }; struct RewriteInfoCompare { @@ -1394,7 +1319,7 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit, // easily see a situation where both registers are reloaded before // the INSERT_SUBREG and both target registers that would overlap. continue; - RewriteMIs.push_back(RewriteInfo(index, MI, O.isUse(), O.isDef())); + RewriteMIs.push_back(RewriteInfo(index, MI)); } std::sort(RewriteMIs.begin(), RewriteMIs.end(), RewriteInfoCompare()); @@ -1404,18 +1329,11 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit, RewriteInfo &rwi = RewriteMIs[i]; ++i; SlotIndex index = rwi.Index; - bool MIHasUse = rwi.HasUse; - bool MIHasDef = rwi.HasDef; MachineInstr *MI = rwi.MI; // If MI def and/or use the same register multiple times, then there // are multiple entries. - unsigned NumUses = MIHasUse; while (i != e && RewriteMIs[i].MI == MI) { assert(RewriteMIs[i].Index == index); - bool isUse = RewriteMIs[i].HasUse; - if (isUse) ++NumUses; - MIHasUse |= isUse; - MIHasDef |= RewriteMIs[i].HasDef; ++i; } MachineBasicBlock *MBB = MI->getParent(); @@ -1440,7 +1358,8 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit, // = use // It's better to start a new interval to avoid artifically // extend the new interval. 
- if (MIHasDef && !MIHasUse) { + if (MI->readsWritesVirtualRegister(li.reg) == + std::make_pair(false,true)) { MBBVRegsMap.erase(MBB->getNumber()); ThisVReg = 0; } @@ -1651,104 +1570,10 @@ LiveIntervals::normalizeSpillWeights(std::vector &NewLIs) { normalizeSpillWeight(*NewLIs[i]); } -std::vector LiveIntervals:: -addIntervalsForSpillsFast(const LiveInterval &li, - const MachineLoopInfo *loopInfo, - VirtRegMap &vrm) { - unsigned slot = vrm.assignVirt2StackSlot(li.reg); - - std::vector added; - - assert(li.isSpillable() && "attempt to spill already spilled interval!"); - - DEBUG({ - dbgs() << "\t\t\t\tadding intervals for spills for interval: "; - li.dump(); - dbgs() << '\n'; - }); - - const TargetRegisterClass* rc = mri_->getRegClass(li.reg); - - MachineRegisterInfo::reg_iterator RI = mri_->reg_begin(li.reg); - while (RI != mri_->reg_end()) { - MachineInstr* MI = &*RI; - - SmallVector Indices; - bool HasUse = false; - bool HasDef = false; - - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand& mop = MI->getOperand(i); - if (!mop.isReg() || mop.getReg() != li.reg) continue; - - HasUse |= MI->getOperand(i).isUse(); - HasDef |= MI->getOperand(i).isDef(); - - Indices.push_back(i); - } - - if (!tryFoldMemoryOperand(MI, vrm, NULL, getInstructionIndex(MI), - Indices, true, slot, li.reg)) { - unsigned NewVReg = mri_->createVirtualRegister(rc); - vrm.grow(); - vrm.assignVirt2StackSlot(NewVReg, slot); - - // create a new register for this spill - LiveInterval &nI = getOrCreateInterval(NewVReg); - nI.markNotSpillable(); - - // Rewrite register operands to use the new vreg. - for (SmallVectorImpl::iterator I = Indices.begin(), - E = Indices.end(); I != E; ++I) { - MI->getOperand(*I).setReg(NewVReg); - - if (MI->getOperand(*I).isUse()) - MI->getOperand(*I).setIsKill(true); - } - - // Fill in the new live interval. 
- SlotIndex index = getInstructionIndex(MI); - if (HasUse) { - LiveRange LR(index.getLoadIndex(), index.getUseIndex(), - nI.getNextValue(SlotIndex(), 0, false, - getVNInfoAllocator())); - DEBUG(dbgs() << " +" << LR); - nI.addRange(LR); - vrm.addRestorePoint(NewVReg, MI); - } - if (HasDef) { - LiveRange LR(index.getDefIndex(), index.getStoreIndex(), - nI.getNextValue(SlotIndex(), 0, false, - getVNInfoAllocator())); - DEBUG(dbgs() << " +" << LR); - nI.addRange(LR); - vrm.addSpillPoint(NewVReg, true, MI); - } - - added.push_back(&nI); - - DEBUG({ - dbgs() << "\t\t\t\tadded new interval: "; - nI.dump(); - dbgs() << '\n'; - }); - } - - - RI = mri_->reg_begin(li.reg); - } - - return added; -} - std::vector LiveIntervals:: addIntervalsForSpills(const LiveInterval &li, SmallVectorImpl &SpillIs, const MachineLoopInfo *loopInfo, VirtRegMap &vrm) { - - if (EnableFastSpilling) - return addIntervalsForSpillsFast(li, loopInfo, vrm); - assert(li.isSpillable() && "attempt to spill already spilled interval!"); DEBUG({ @@ -2184,7 +2009,6 @@ LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg, SlotIndex(getInstructionIndex(startInst).getDefIndex()), startInst, true, getVNInfoAllocator()); VN->setHasPHIKill(true); - VN->kills.push_back(indexes_->getTerminatorGap(startInst->getParent())); LiveRange LR( SlotIndex(getInstructionIndex(startInst).getDefIndex()), getMBBEndIdx(startInst->getParent()), VN); diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index 798b9b939cd3..709e2c6d5ca7 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -35,8 +35,8 @@ void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const { } void LiveStacks::releaseMemory() { - // Release VNInfo memroy regions after all VNInfo objects are dtor'd. - VNInfoAllocator.DestroyAll(); + // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd. + VNInfoAllocator.Reset(); S2IMap.clear(); S2RCMap.clear(); } diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 079684eea079..41b891d30f23 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -286,7 +286,7 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) { MachineInstr *LastDef = PhysRegDef[Reg]; MachineInstr *LastUse = PhysRegUse[Reg]; if (!LastDef && !LastUse) - return false; + return 0; MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef; unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef]; @@ -609,7 +609,12 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { // Finally, if the last instruction in the block is a return, make sure to // mark it as using all of the live-out values in the function. - if (!MBB->empty() && MBB->back().getDesc().isReturn()) { + // Things marked both call and return are tail calls; do not do this for + // them. The tail callee need not take the same registers as input + // that it produces as output, and there are dependencies for its input + // registers elsewhere. 
+ if (!MBB->empty() && MBB->back().getDesc().isReturn() + && !MBB->back().getDesc().isCall()) { MachineInstr *Ret = &MBB->back(); for (MachineRegisterInfo::liveout_iterator diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/LowerSubregs.cpp index b0348a5b753c..dfd4eaeca660 100644 --- a/lib/CodeGen/LowerSubregs.cpp +++ b/lib/CodeGen/LowerSubregs.cpp @@ -53,15 +53,15 @@ namespace { bool runOnMachineFunction(MachineFunction&); private: - bool LowerExtract(MachineInstr *MI); - bool LowerInsert(MachineInstr *MI); bool LowerSubregToReg(MachineInstr *MI); + bool LowerCopy(MachineInstr *MI); void TransferDeadFlag(MachineInstr *MI, unsigned DstReg, const TargetRegisterInfo *TRI); void TransferKillFlag(MachineInstr *MI, unsigned SrcReg, const TargetRegisterInfo *TRI, bool AddIfNotFound = false); + void TransferImplicitDefs(MachineInstr *MI); }; char LowerSubregsInstructionPass::ID = 0; @@ -83,7 +83,7 @@ LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI, if (MII->addRegisterDead(DstReg, TRI)) break; assert(MII != MI->getParent()->begin() && - "copyRegToReg output doesn't reference destination register!"); + "copyPhysReg output doesn't reference destination register!"); } } @@ -100,64 +100,24 @@ LowerSubregsInstructionPass::TransferKillFlag(MachineInstr *MI, if (MII->addRegisterKilled(SrcReg, TRI, AddIfNotFound)) break; assert(MII != MI->getParent()->begin() && - "copyRegToReg output doesn't reference source register!"); + "copyPhysReg output doesn't reference source register!"); } } -bool LowerSubregsInstructionPass::LowerExtract(MachineInstr *MI) { - MachineBasicBlock *MBB = MI->getParent(); - - assert(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() && - MI->getOperand(1).isReg() && MI->getOperand(1).isUse() && - MI->getOperand(2).isImm() && "Malformed extract_subreg"); - - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SuperReg = MI->getOperand(1).getReg(); - unsigned SubIdx = MI->getOperand(2).getImm(); - unsigned SrcReg = TRI->getSubReg(SuperReg, SubIdx); - - assert(TargetRegisterInfo::isPhysicalRegister(SuperReg) && - "Extract supperg source must be a physical register"); - assert(TargetRegisterInfo::isPhysicalRegister(DstReg) && - "Extract destination must be in a physical register"); - assert(SrcReg && "invalid subregister index for register"); - - DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); - - if (SrcReg == DstReg) { - // No need to insert an identity copy instruction. - if (MI->getOperand(1).isKill()) { - // We must make sure the super-register gets killed. Replace the - // instruction with KILL. - MI->setDesc(TII->get(TargetOpcode::KILL)); - MI->RemoveOperand(2); // SubIdx - DEBUG(dbgs() << "subreg: replace by: " << *MI); - return true; - } - - DEBUG(dbgs() << "subreg: eliminated!"); - } else { - // Insert copy - const TargetRegisterClass *TRCS = TRI->getPhysicalRegisterRegClass(DstReg); - const TargetRegisterClass *TRCD = TRI->getPhysicalRegisterRegClass(SrcReg); - bool Emitted = TII->copyRegToReg(*MBB, MI, DstReg, SrcReg, TRCD, TRCS, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Subreg and Dst must be of compatible register class"); - // Transfer the kill/dead flags, if needed. 
- if (MI->getOperand(0).isDead()) - TransferDeadFlag(MI, DstReg, TRI); - if (MI->getOperand(1).isKill()) - TransferKillFlag(MI, SuperReg, TRI, true); - DEBUG({ - MachineBasicBlock::iterator dMI = MI; - dbgs() << "subreg: " << *(--dMI); - }); +/// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered +/// replacement instructions immediately precede it. Copy any implicit-def +/// operands from MI to the replacement instruction. +void +LowerSubregsInstructionPass::TransferImplicitDefs(MachineInstr *MI) { + MachineBasicBlock::iterator CopyMI = MI; + --CopyMI; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isImplicit() || MO.isUse()) + continue; + CopyMI->addOperand(MachineOperand::CreateReg(MO.getReg(), true, true)); } - - DEBUG(dbgs() << '\n'); - MBB->erase(MI); - return true; } bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) { @@ -166,10 +126,10 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) { MI->getOperand(1).isImm() && (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) && MI->getOperand(3).isImm() && "Invalid subreg_to_reg"); - + unsigned DstReg = MI->getOperand(0).getReg(); unsigned InsReg = MI->getOperand(2).getReg(); - unsigned InsSIdx = MI->getOperand(2).getSubReg(); + assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?"); unsigned SubIdx = MI->getOperand(3).getImm(); assert(SubIdx != 0 && "Invalid index for insert_subreg"); @@ -182,27 +142,25 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) { DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); - if (DstSubReg == InsReg && InsSIdx == 0) { + if (DstSubReg == InsReg) { // No need to insert an identify copy instruction. // Watch out for case like this: - // %RAX = ... - // %RAX = SUBREG_TO_REG 0, %EAX:3, 3 - // The first def is defining RAX, not EAX so the top bits were not - // zero extended. + // %RAX = SUBREG_TO_REG 0, %EAX, 3 + // We must leave %RAX live. + if (DstReg != InsReg) { + MI->setDesc(TII->get(TargetOpcode::KILL)); + MI->RemoveOperand(3); // SubIdx + MI->RemoveOperand(1); // Imm + DEBUG(dbgs() << "subreg: replace by: " << *MI); + return true; + } DEBUG(dbgs() << "subreg: eliminated!"); } else { - // Insert sub-register copy - const TargetRegisterClass *TRC0= TRI->getPhysicalRegisterRegClass(DstSubReg); - const TargetRegisterClass *TRC1= TRI->getPhysicalRegisterRegClass(InsReg); - bool Emitted = TII->copyRegToReg(*MBB, MI, DstSubReg, InsReg, TRC0, TRC1, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Subreg and Dst must be of compatible register class"); + TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg, + MI->getOperand(2).isKill()); // Transfer the kill/dead flags, if needed. 
if (MI->getOperand(0).isDead()) TransferDeadFlag(MI, DstSubReg, TRI); - if (MI->getOperand(2).isKill()) - TransferKillFlag(MI, InsReg, TRI); DEBUG({ MachineBasicBlock::iterator dMI = MI; dbgs() << "subreg: " << *(--dMI); @@ -214,87 +172,39 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) { return true; } -bool LowerSubregsInstructionPass::LowerInsert(MachineInstr *MI) { - MachineBasicBlock *MBB = MI->getParent(); - assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) && - (MI->getOperand(1).isReg() && MI->getOperand(1).isUse()) && - (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) && - MI->getOperand(3).isImm() && "Invalid insert_subreg"); - - unsigned DstReg = MI->getOperand(0).getReg(); -#ifndef NDEBUG - unsigned SrcReg = MI->getOperand(1).getReg(); -#endif - unsigned InsReg = MI->getOperand(2).getReg(); - unsigned SubIdx = MI->getOperand(3).getImm(); +bool LowerSubregsInstructionPass::LowerCopy(MachineInstr *MI) { + MachineOperand &DstMO = MI->getOperand(0); + MachineOperand &SrcMO = MI->getOperand(1); - assert(DstReg == SrcReg && "insert_subreg not a two-address instruction?"); - assert(SubIdx != 0 && "Invalid index for insert_subreg"); - unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx); - assert(DstSubReg && "invalid subregister index for register"); - assert(TargetRegisterInfo::isPhysicalRegister(SrcReg) && - "Insert superreg source must be in a physical register"); - assert(TargetRegisterInfo::isPhysicalRegister(InsReg) && - "Inserted value must be in a physical register"); - - DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); - - if (DstSubReg == InsReg) { - // No need to insert an identity copy instruction. If the SrcReg was - // , we need to make sure it is alive by inserting a KILL - if (MI->getOperand(1).isUndef() && !MI->getOperand(0).isDead()) { - MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstReg); - if (MI->getOperand(2).isUndef()) - MIB.addReg(InsReg, RegState::Undef); - else - MIB.addReg(InsReg, RegState::Kill); - } else { - DEBUG(dbgs() << "subreg: eliminated!\n"); - MBB->erase(MI); + if (SrcMO.getReg() == DstMO.getReg()) { + DEBUG(dbgs() << "identity copy: " << *MI); + // No need to insert an identity copy instruction, but replace with a KILL + // if liveness is changed. + if (DstMO.isDead() || SrcMO.isUndef() || MI->getNumOperands() > 2) { + // We must make sure the super-register gets killed. Replace the + // instruction with KILL. + MI->setDesc(TII->get(TargetOpcode::KILL)); + DEBUG(dbgs() << "replaced by: " << *MI); return true; } - } else { - // Insert sub-register copy - const TargetRegisterClass *TRC0= TRI->getPhysicalRegisterRegClass(DstSubReg); - const TargetRegisterClass *TRC1= TRI->getPhysicalRegisterRegClass(InsReg); - if (MI->getOperand(2).isUndef()) - // If the source register being inserted is undef, then this becomes a - // KILL. - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstSubReg); - else { - bool Emitted = TII->copyRegToReg(*MBB, MI, DstSubReg, InsReg, TRC0, TRC1, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Subreg and Dst must be of compatible register class"); - } - MachineBasicBlock::iterator CopyMI = MI; - --CopyMI; - - // INSERT_SUBREG is a two-address instruction so it implicitly kills SrcReg. - if (!MI->getOperand(1).isUndef()) - CopyMI->addOperand(MachineOperand::CreateReg(DstReg, false, true, true)); - - // Transfer the kill/dead flags, if needed. 
- if (MI->getOperand(0).isDead()) { - TransferDeadFlag(MI, DstSubReg, TRI); - } else { - // Make sure the full DstReg is live after this replacement. - CopyMI->addOperand(MachineOperand::CreateReg(DstReg, true, true)); - } - - // Make sure the inserted register gets killed - if (MI->getOperand(2).isKill() && !MI->getOperand(2).isUndef()) - TransferKillFlag(MI, InsReg, TRI); + // Vanilla identity copy. + MI->eraseFromParent(); + return true; } - DEBUG({ - MachineBasicBlock::iterator dMI = MI; - dbgs() << "subreg: " << *(--dMI) << "\n"; - }); + DEBUG(dbgs() << "real copy: " << *MI); + TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), + DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill()); - MBB->erase(MI); + if (DstMO.isDead()) + TransferDeadFlag(MI, DstMO.getReg(), TRI); + if (MI->getNumOperands() > 2) + TransferImplicitDefs(MI); + DEBUG({ + MachineBasicBlock::iterator dMI = MI; + dbgs() << "replaced by: " << *(--dMI); + }); + MI->eraseFromParent(); return true; } @@ -317,12 +227,13 @@ bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) { mi != me;) { MachineBasicBlock::iterator nmi = llvm::next(mi); MachineInstr *MI = mi; - if (MI->isExtractSubreg()) { - MadeChange |= LowerExtract(MI); - } else if (MI->isInsertSubreg()) { - MadeChange |= LowerInsert(MI); - } else if (MI->isSubregToReg()) { + assert(!MI->isInsertSubreg() && "INSERT_SUBREG should no longer appear"); + assert(MI->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + "EXTRACT_SUBREG should no longer appear"); + if (MI->isSubregToReg()) { MadeChange |= LowerSubregToReg(MI); + } else if (MI->isCopy()) { + MadeChange |= LowerCopy(MI); } mi = nmi; } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index eaaa1f85b563..a27ee479433b 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -13,7 +13,10 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/BasicBlock.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -136,6 +139,13 @@ void ilist_traits::deleteNode(MachineInstr* MI) { Parent->getParent()->DeleteMachineInstr(MI); } +MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { + iterator I = begin(); + while (I != end() && I->isPHI()) + ++I; + return I; +} + MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() { iterator I = end(); while (I != begin() && (--I)->getDesc().isTerminator()) @@ -245,6 +255,7 @@ void MachineBasicBlock::updateTerminator() { MachineBasicBlock *TBB = 0, *FBB = 0; SmallVector Cond; + DebugLoc dl; // FIXME: this is nowhere bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond); (void) B; assert(!B && "UpdateTerminators requires analyzable predecessors!"); @@ -259,7 +270,7 @@ void MachineBasicBlock::updateTerminator() { // its layout successor, insert a branch. 
TBB = *succ_begin(); if (!isLayoutSuccessor(TBB)) - TII->InsertBranch(*this, TBB, 0, Cond); + TII->InsertBranch(*this, TBB, 0, Cond, dl); } } else { if (FBB) { @@ -270,10 +281,10 @@ void MachineBasicBlock::updateTerminator() { if (TII->ReverseBranchCondition(Cond)) return; TII->RemoveBranch(*this); - TII->InsertBranch(*this, FBB, 0, Cond); + TII->InsertBranch(*this, FBB, 0, Cond, dl); } else if (isLayoutSuccessor(FBB)) { TII->RemoveBranch(*this); - TII->InsertBranch(*this, TBB, 0, Cond); + TII->InsertBranch(*this, TBB, 0, Cond, dl); } } else { // The block has a fallthrough conditional branch. @@ -284,14 +295,14 @@ void MachineBasicBlock::updateTerminator() { if (TII->ReverseBranchCondition(Cond)) { // We can't reverse the condition, add an unconditional branch. Cond.clear(); - TII->InsertBranch(*this, MBBA, 0, Cond); + TII->InsertBranch(*this, MBBA, 0, Cond, dl); return; } TII->RemoveBranch(*this); - TII->InsertBranch(*this, MBBA, 0, Cond); + TII->InsertBranch(*this, MBBA, 0, Cond, dl); } else if (!isLayoutSuccessor(MBBA)) { TII->RemoveBranch(*this); - TII->InsertBranch(*this, TBB, MBBA, Cond); + TII->InsertBranch(*this, TBB, MBBA, Cond, dl); } } } @@ -331,12 +342,32 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) { if (this == fromMBB) return; - for (MachineBasicBlock::succ_iterator I = fromMBB->succ_begin(), - E = fromMBB->succ_end(); I != E; ++I) - addSuccessor(*I); + while (!fromMBB->succ_empty()) { + MachineBasicBlock *Succ = *fromMBB->succ_begin(); + addSuccessor(Succ); + fromMBB->removeSuccessor(Succ); + } +} + +void +MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { + if (this == fromMBB) + return; - while (!fromMBB->succ_empty()) - fromMBB->removeSuccessor(fromMBB->succ_begin()); + while (!fromMBB->succ_empty()) { + MachineBasicBlock *Succ = *fromMBB->succ_begin(); + addSuccessor(Succ); + fromMBB->removeSuccessor(Succ); + + // Fix up any PHI nodes in the successor. + for (MachineBasicBlock::iterator MI = Succ->begin(), ME = Succ->end(); + MI != ME && MI->isPHI(); ++MI) + for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) { + MachineOperand &MO = MI->getOperand(i); + if (MO.getMBB() == fromMBB) + MO.setMBB(this); + } + } } bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const { @@ -395,6 +426,82 @@ bool MachineBasicBlock::canFallThrough() { return FBB == 0; } +MachineBasicBlock * +MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { + MachineFunction *MF = getParent(); + DebugLoc dl; // FIXME: this is nowhere + + // We may need to update this's terminator, but we can't do that if AnalyzeBranch + // fails. If this uses a jump table, we won't touch it. + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + MachineBasicBlock *TBB = 0, *FBB = 0; + SmallVector Cond; + if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) + return NULL; + + MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); + MF->insert(llvm::next(MachineFunction::iterator(this)), NMBB); + DEBUG(dbgs() << "PHIElimination splitting critical edge:" + " BB#" << getNumber() + << " -- BB#" << NMBB->getNumber() + << " -- BB#" << Succ->getNumber() << '\n'); + + ReplaceUsesOfBlockWith(Succ, NMBB); + updateTerminator(); + + // Insert unconditional "jump Succ" instruction in NMBB if necessary. 
+  NMBB->addSuccessor(Succ);
+  if (!NMBB->isLayoutSuccessor(Succ)) {
+    Cond.clear();
+    MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, NULL, Cond, dl);
+  }
+
+  // Fix PHI nodes in Succ so they refer to NMBB instead of this
+  for (MachineBasicBlock::iterator i = Succ->begin(), e = Succ->end();
+       i != e && i->isPHI(); ++i)
+    for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2)
+      if (i->getOperand(ni+1).getMBB() == this)
+        i->getOperand(ni+1).setMBB(NMBB);
+
+  if (LiveVariables *LV =
+        P->getAnalysisIfAvailable<LiveVariables>())
+    LV->addNewBlock(NMBB, this, Succ);
+
+  if (MachineDominatorTree *MDT =
+        P->getAnalysisIfAvailable<MachineDominatorTree>())
+    MDT->addNewBlock(NMBB, this);
+
+  if (MachineLoopInfo *MLI =
+        P->getAnalysisIfAvailable<MachineLoopInfo>())
+    if (MachineLoop *TIL = MLI->getLoopFor(this)) {
+      // If one or the other blocks were not in a loop, the new block is not
+      // either, and thus LI doesn't need to be updated.
+      if (MachineLoop *DestLoop = MLI->getLoopFor(Succ)) {
+        if (TIL == DestLoop) {
+          // Both in the same loop, the NMBB joins loop.
+          DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else if (TIL->contains(DestLoop)) {
+          // Edge from an outer loop to an inner loop. Add to the outer loop.
+          TIL->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else if (DestLoop->contains(TIL)) {
+          // Edge from an inner loop to an outer loop. Add to the outer loop.
+          DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else {
+          // Edge from two loops with no containment relation. Because these
+          // are natural loops, we know that the destination block must be the
+          // header of its loop (adding a branch into a loop elsewhere would
+          // create an irreducible loop).
+          assert(DestLoop->getHeader() == Succ &&
+                 "Should not create irreducible loops!");
+          if (MachineLoop *P = DestLoop->getParentLoop())
+            P->addBasicBlockToLoop(NMBB, MLI->getBase());
+        }
+      }
+    }
+
+  return NMBB;
+}
+
 /// removeFromParent - This method unlinks 'this' from the containing function,
 /// and returns it, but does not delete it.
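// --- Illustrative sketch, not part of the patch: typical use of the new
// MachineBasicBlock::SplitCriticalEdge() from inside a machine pass, following
// the pattern the later MachineLICM and PHIElimination hunks adopt. The call
// can return NULL when the terminator cannot be analyzed (e.g. jump tables),
// so callers must be prepared to give up. splitEdgeIfNeeded is a placeholder
// name.

static MachineBasicBlock *splitEdgeIfNeeded(MachineBasicBlock *FromMBB,
                                            MachineBasicBlock *ToMBB,
                                            Pass *P) {
  // A block with a single successor does not form a critical edge; it can be
  // used directly.
  if (FromMBB->succ_size() == 1)
    return FromMBB;
  // Passing the pass lets the new block be registered with LiveVariables, the
  // dominator tree and loop info when those analyses are available.
  return FromMBB->SplitCriticalEdge(ToMBB, P); // NULL if unanalyzable.
}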
MachineBasicBlock *MachineBasicBlock::removeFromParent() { diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 6f4f7a883409..833cc00027db 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -30,9 +30,7 @@ using namespace llvm; STATISTIC(NumCoalesces, "Number of copies coalesced"); STATISTIC(NumCSEs, "Number of common subexpression eliminated"); - -static cl::opt CSEPhysDef("machine-cse-phys-defs", - cl::init(false), cl::Hidden); +STATISTIC(NumPhysCSEs, "Number of phyreg defining common subexpr eliminated"); namespace { class MachineCSE : public MachineFunctionPass { @@ -128,6 +126,28 @@ bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI, ++NumCoalesces; Changed = true; } + + if (!DefMI->isCopy()) + continue; + SrcReg = DefMI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + continue; + if (DefMI->getOperand(0).getSubReg() || DefMI->getOperand(1).getSubReg()) + continue; + const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg); + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + const TargetRegisterClass *NewRC = getCommonSubClass(RC, SRC); + if (!NewRC) + continue; + DEBUG(dbgs() << "Coalescing: " << *DefMI); + DEBUG(dbgs() << "*** to: " << *MI); + MO.setReg(SrcReg); + MRI->clearKillFlags(SrcReg); + if (NewRC != SRC) + MRI->setRegClass(SrcReg, NewRC); + DefMI->eraseFromParent(); + ++NumCoalesces; + Changed = true; } return Changed; @@ -172,7 +192,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg, /// hasLivePhysRegDefUse - Return true if the specified instruction read / write /// physical registers (except for dead defs of physical registers). It also -/// returns the physical register def by reference if it's the only one. +/// returns the physical register def by reference if it's the only one and the +/// instruction does not uses a physical register. bool MachineCSE::hasLivePhysRegDefUse(const MachineInstr *MI, const MachineBasicBlock *MBB, unsigned &PhysDef) const { @@ -186,9 +207,11 @@ bool MachineCSE::hasLivePhysRegDefUse(const MachineInstr *MI, continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) continue; - if (MO.isUse()) + if (MO.isUse()) { // Can't touch anything to read a physical register. + PhysDef = 0; return true; + } if (MO.isDead()) // If the def is dead, it's ok. continue; @@ -240,8 +263,8 @@ bool MachineCSE::PhysRegDefReaches(MachineInstr *CSMI, MachineInstr *MI, static bool isCopy(const MachineInstr *MI, const TargetInstrInfo *TII) { unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - return TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) || - MI->isExtractSubreg() || MI->isInsertSubreg() || MI->isSubregToReg(); + return MI->isCopyLike() || + TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx); } bool MachineCSE::isCSECandidate(MachineInstr *MI) { @@ -356,6 +379,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { if (!isCSECandidate(MI)) continue; + bool DefPhys = false; bool FoundCSE = VNT.count(MI); if (!FoundCSE) { // Look for trivial copy coalescing opportunities. @@ -376,11 +400,13 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // ... Unless the CS is local and it also defines the physical register // which is not clobbered in between. 
- if (PhysDef && CSEPhysDef) { + if (PhysDef) { unsigned CSVN = VNT.lookup(MI); MachineInstr *CSMI = Exps[CSVN]; - if (PhysRegDefReaches(CSMI, MI, PhysDef)) + if (PhysRegDefReaches(CSMI, MI, PhysDef)) { FoundCSE = true; + DefPhys = true; + } } } @@ -426,6 +452,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { } MI->eraseFromParent(); ++NumCSEs; + if (DefPhys) + ++NumPhysCSEs; } else { DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); VNT.insert(MI, CurrVN++); diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp index 408873903b0d..b5f8fbba99de 100644 --- a/lib/CodeGen/MachineDominators.cpp +++ b/lib/CodeGen/MachineDominators.cpp @@ -46,7 +46,6 @@ MachineDominatorTree::MachineDominatorTree() } MachineDominatorTree::~MachineDominatorTree() { - DT->releaseMemory(); delete DT; } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index a38c881982e7..666120f032c6 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -378,7 +378,7 @@ void MachineFunction::viewCFG() const #ifndef NDEBUG ViewGraph(this, "mf" + getFunction()->getNameStr()); #else - errs() << "SelectionDAG::viewGraph is only available in debug builds on " + errs() << "MachineFunction::viewCFG is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } @@ -388,7 +388,7 @@ void MachineFunction::viewCFGOnly() const #ifndef NDEBUG ViewGraph(this, "mf" + getFunction()->getNameStr(), true); #else - errs() << "SelectionDAG::viewGraph is only available in debug builds on " + errs() << "MachineFunction::viewCFGOnly is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } @@ -438,10 +438,16 @@ MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, /// index with a negative value. /// int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, - bool Immutable, bool isSS) { + bool Immutable) { assert(Size != 0 && "Cannot allocate zero size fixed stack objects!"); - Objects.insert(Objects.begin(), StackObject(Size, 1, SPOffset, Immutable, - isSS)); + // The alignment of the frame index can be determined from its offset from + // the incoming frame position. If the frame object is at offset 32 and + // the stack is guaranteed to be 16-byte aligned, then we know that the + // object is 16-byte aligned. 
+ unsigned StackAlign = TFI.getStackAlignment(); + unsigned Align = MinAlign(SPOffset, StackAlign); + Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, + /*isSS*/false)); return -++NumFixedObjects; } diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e54cd5cf9492..6b2e98549c71 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -111,6 +111,26 @@ void MachineOperand::setReg(unsigned Reg) { Contents.Reg.RegNo = Reg; } +void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx, + const TargetRegisterInfo &TRI) { + assert(TargetRegisterInfo::isVirtualRegister(Reg)); + if (SubIdx && getSubReg()) + SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg()); + setReg(Reg); + if (SubIdx) + setSubReg(SubIdx); +} + +void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) { + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + if (getSubReg()) { + Reg = TRI.getSubReg(Reg, getSubReg()); + assert(Reg && "Invalid SubReg for physical register"); + setSubReg(0); + } + setReg(Reg); +} + /// ChangeToImmediate - Replace this operand with a new immediate operand of /// the specified value. If an operand is known to be an immediate already, /// the setImm method should be used. @@ -861,14 +881,14 @@ int MachineInstr::findFirstPredOperandIdx() const { bool MachineInstr:: isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const { if (isInlineAsm()) { - assert(DefOpIdx >= 2); + assert(DefOpIdx >= 3); const MachineOperand &MO = getOperand(DefOpIdx); if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0) return false; // Determine the actual operand index that corresponds to this index. unsigned DefNo = 0; unsigned DefPart = 0; - for (unsigned i = 1, e = getNumOperands(); i < e; ) { + for (unsigned i = 2, e = getNumOperands(); i < e; ) { const MachineOperand &FMO = getOperand(i); // After the normal asm operands there may be additional imp-def regs. if (!FMO.isImm()) @@ -883,7 +903,7 @@ isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const { } ++DefNo; } - for (unsigned i = 1, e = getNumOperands(); i != e; ++i) { + for (unsigned i = 2, e = getNumOperands(); i != e; ++i) { const MachineOperand &FMO = getOperand(i); if (!FMO.isImm()) continue; @@ -926,7 +946,7 @@ isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const { // Find the flag operand corresponding to UseOpIdx unsigned FlagIdx, NumOps=0; - for (FlagIdx = 1; FlagIdx < UseOpIdx; FlagIdx += NumOps+1) { + for (FlagIdx = 2; FlagIdx < UseOpIdx; FlagIdx += NumOps+1) { const MachineOperand &UFMO = getOperand(FlagIdx); // After the normal asm operands there may be additional imp-def regs. if (!UFMO.isImm()) @@ -944,9 +964,9 @@ isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const { if (!DefOpIdx) return true; - unsigned DefIdx = 1; - // Remember to adjust the index. First operand is asm string, then there - // is a flag for each. + unsigned DefIdx = 2; + // Remember to adjust the index. First operand is asm string, second is + // the AlignStack bit, then there is a flag for each. 
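// --- Illustrative sketch, not part of the patch: the alignment inference that
// the new CreateFixedObject() code above relies on. minAlign() here is a local
// stand-in assumed to behave like llvm::MinAlign(): it yields the largest
// power of two dividing both inputs, i.e. the alignment that can still be
// guaranteed for a fixed object at the given offset from an aligned stack.

#include <cassert>
#include <stdint.h>

static uint64_t minAlign(uint64_t A, uint64_t B) {
  // Lowest set bit of (A | B) == largest power of two dividing both values.
  return (A | B) & (~(A | B) + 1);
}

int main() {
  // 16-byte aligned stack, object at offset 32: still 16-byte aligned.
  assert(minAlign(32, 16) == 16);
  // Same stack, object at offset 40: only 8-byte alignment can be assumed.
  assert(minAlign(40, 16) == 8);
  return 0;
}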
while (DefNo) { const MachineOperand &FMO = getOperand(DefIdx); assert(FMO.isImm()); @@ -1017,6 +1037,29 @@ void MachineInstr::copyPredicates(const MachineInstr *MI) { } } +void MachineInstr::substituteRegister(unsigned FromReg, + unsigned ToReg, + unsigned SubIdx, + const TargetRegisterInfo &RegInfo) { + if (TargetRegisterInfo::isPhysicalRegister(ToReg)) { + if (SubIdx) + ToReg = RegInfo.getSubReg(ToReg, SubIdx); + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + MachineOperand &MO = getOperand(i); + if (!MO.isReg() || MO.getReg() != FromReg) + continue; + MO.substPhysReg(ToReg, RegInfo); + } + } else { + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + MachineOperand &MO = getOperand(i); + if (!MO.isReg() || MO.getReg() != FromReg) + continue; + MO.substVirtReg(ToReg, SubIdx, RegInfo); + } + } +} + /// isSafeToMove - Return true if it is safe to move this instruction. If /// SawStore is set to true, it means that there is a store (or call) between /// the instruction's location and its intended destination. @@ -1168,6 +1211,28 @@ void MachineInstr::dump() const { dbgs() << " " << *this; } +static void printDebugLoc(DebugLoc DL, const MachineFunction *MF, + raw_ostream &CommentOS) { + const LLVMContext &Ctx = MF->getFunction()->getContext(); + if (!DL.isUnknown()) { // Print source line info. + DIScope Scope(DL.getScope(Ctx)); + // Omit the directory, because it's likely to be long and uninteresting. + if (Scope.Verify()) + CommentOS << Scope.getFilename(); + else + CommentOS << ""; + CommentOS << ':' << DL.getLine(); + if (DL.getCol() != 0) + CommentOS << ':' << DL.getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + CommentOS << " @[ "; + printDebugLoc(InlinedAtDL, MF, CommentOS); + CommentOS << " ]"; + } + } +} + void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { // We can be a bit tidier if we know the TargetMachine and/or MachineFunction. const MachineFunction *MF = 0; @@ -1240,6 +1305,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { OS << "!\"" << MDS->getString() << '\"'; else MO.print(OS, TM); + } else if (TM && (isInsertSubreg() || isRegSequence()) && MO.isImm()) { + OS << TM->getRegisterInfo()->getSubRegIndexName(MO.getImm()); } else MO.print(OS, TM); } @@ -1265,19 +1332,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const { if (!debugLoc.isUnknown() && MF) { if (!HaveSemi) OS << ";"; - - // TODO: print InlinedAtLoc information - - DIScope Scope(debugLoc.getScope(MF->getFunction()->getContext())); OS << " dbg:"; - // Omit the directory, since it's usually long and uninteresting. 
- if (Scope.Verify()) - OS << Scope.getFilename(); - else - OS << ""; - OS << ':' << debugLoc.getLine(); - if (debugLoc.getCol() != 0) - OS << ':' << debugLoc.getCol(); + printDebugLoc(debugLoc, MF, OS); } OS << "\n"; @@ -1418,6 +1474,25 @@ void MachineInstr::addRegisterDefined(unsigned IncomingReg, true /*IsImp*/)); } +void MachineInstr::setPhysRegsDeadExcept(const SmallVectorImpl &UsedRegs, + const TargetRegisterInfo &TRI) { + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + MachineOperand &MO = getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + unsigned Reg = MO.getReg(); + if (Reg == 0) continue; + bool Dead = true; + for (SmallVectorImpl::const_iterator I = UsedRegs.begin(), + E = UsedRegs.end(); I != E; ++I) + if (TRI.regsOverlap(*I, Reg)) { + Dead = false; + break; + } + // If there are no uses, including partial uses, the def is dead. + if (Dead) MO.setIsDead(); + } +} + unsigned MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { unsigned Hash = MI->getOpcode() * 37; diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 61206173e645..956d21c0b34b 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -62,6 +62,7 @@ namespace { // State that is updated as we process loops bool Changed; // True if a loop is changed. + bool FirstInLoop; // True if it's the first LICM in the loop. MachineLoop *CurLoop; // The current loop we are working on. MachineBasicBlock *CurPreheader; // The preheader for CurLoop. @@ -82,7 +83,6 @@ namespace { const char *getPassName() const { return "Machine Instruction LICM"; } - // FIXME: Loop preheaders? virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired(); @@ -127,8 +127,8 @@ namespace { void AddToLiveIns(unsigned Reg); /// IsLICMCandidate - Returns true if the instruction may be a suitable - /// candidate for LICM. e.g. If the instruction is a call, then it's obviously - /// not safe to hoist it. + /// candidate for LICM. e.g. If the instruction is a call, then it's + /// obviously not safe to hoist it. bool IsLICMCandidate(MachineInstr &I); /// IsLoopInvariantInst - Returns true if the instruction is loop @@ -181,6 +181,10 @@ namespace { /// current loop preheader that may become duplicates of instructions that /// are hoisted out of the loop. void InitCSEMap(MachineBasicBlock *BB); + + /// getCurPreheader - Get the preheader for the current loop, splitting + /// a critical edge if needed. + MachineBasicBlock *getCurPreheader(); }; } // end anonymous namespace @@ -192,12 +196,17 @@ FunctionPass *llvm::createMachineLICMPass(bool PreRegAlloc) { return new MachineLICM(PreRegAlloc); } -/// LoopIsOuterMostWithPreheader - Test if the given loop is the outer-most -/// loop that has a preheader. -static bool LoopIsOuterMostWithPreheader(MachineLoop *CurLoop) { +/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most +/// loop that has a unique predecessor. +static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { + // Check whether this loop even has a unique predecessor. + if (!CurLoop->getLoopPredecessor()) + return false; + // Ok, now check to see if any of its outer loops do. for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop()) - if (L->getLoopPreheader()) + if (L->getLoopPredecessor()) return false; + // None of them did, so this is the outermost with a unique predecessor. 
return true; } @@ -207,7 +216,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { else DEBUG(dbgs() << "******** Post-regalloc Machine LICM ********\n"); - Changed = false; + Changed = FirstInLoop = false; TM = &MF.getTarget(); TII = TM->getInstrInfo(); TRI = TM->getRegisterInfo(); @@ -220,23 +229,17 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { DT = &getAnalysis(); AA = &getAnalysis(); - for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I){ - CurLoop = *I; + SmallVector Worklist(MLI->begin(), MLI->end()); + while (!Worklist.empty()) { + CurLoop = Worklist.pop_back_val(); + CurPreheader = 0; // If this is done before regalloc, only visit outer-most preheader-sporting // loops. - if (PreRegAlloc && !LoopIsOuterMostWithPreheader(CurLoop)) - continue; - - // Determine the block to which to hoist instructions. If we can't find a - // suitable loop preheader, we can't do any hoisting. - // - // FIXME: We are only hoisting if the basic block coming into this loop - // has only one successor. This isn't the case in general because we haven't - // broken critical edges or added preheaders. - CurPreheader = CurLoop->getLoopPreheader(); - if (!CurPreheader) + if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) { + Worklist.append(CurLoop->begin(), CurLoop->end()); continue; + } if (!PreRegAlloc) HoistRegionPostRA(); @@ -244,6 +247,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { // CSEMap is initialized for loop header when the first instruction is // being hoisted. MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader()); + FirstInLoop = true; HoistRegion(N); CSEMap.clear(); } @@ -436,13 +440,16 @@ void MachineLICM::AddToLiveIns(unsigned Reg) { /// operands that is safe to hoist, this instruction is called to do the /// dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { + MachineBasicBlock *Preheader = getCurPreheader(); + if (!Preheader) return; + // Now move the instructions to the predecessor, inserting it before any // terminator instructions. DEBUG({ dbgs() << "Hoisting " << *MI; - if (CurPreheader->getBasicBlock()) + if (Preheader->getBasicBlock()) dbgs() << " to MachineBasicBlock " - << CurPreheader->getName(); + << Preheader->getName(); if (MI->getParent()->getBasicBlock()) dbgs() << " from MachineBasicBlock " << MI->getParent()->getName(); @@ -451,7 +458,7 @@ void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { // Splice the instruction to the preheader. MachineBasicBlock *MBB = MI->getParent(); - CurPreheader->splice(CurPreheader->getFirstTerminator(), MBB, MI); + Preheader->splice(Preheader->getFirstTerminator(), MBB, MI); // Add register to livein list to all the BBs in the current loop since a // loop invariant must be kept live throughout the whole loop. This is @@ -490,26 +497,16 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N) { /// candidate for LICM. e.g. If the instruction is a call, then it's obviously /// not safe to hoist it. bool MachineLICM::IsLICMCandidate(MachineInstr &I) { + // It is not profitable to hoist implicitdefs. FIXME: Why not? what if they + // are an argument to some other otherwise-hoistable instruction? if (I.isImplicitDef()) return false; - - const TargetInstrDesc &TID = I.getDesc(); - // Ignore stuff that we obviously can't hoist. - if (TID.mayStore() || TID.isCall() || TID.isTerminator() || - TID.hasUnmodeledSideEffects()) + // Check if it's safe to move the instruction. 
+ bool DontMoveAcrossStore = true; + if (!I.isSafeToMove(TII, AA, DontMoveAcrossStore)) return false; - - if (TID.mayLoad()) { - // Okay, this instruction does a load. As a refinement, we allow the target - // to decide whether the loaded value is actually a constant. If so, we can - // actually use it as a load. - if (!I.isInvariantLoad(AA)) - // FIXME: we should be able to hoist loads with no other side effects if - // there are no other instructions which can change memory in this loop. - // This is a trivial form of alias analysis. - return false; - } + return true; } @@ -754,6 +751,9 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI, /// that are safe to hoist, this instruction is called to do the dirty work. /// void MachineLICM::Hoist(MachineInstr *MI) { + MachineBasicBlock *Preheader = getCurPreheader(); + if (!Preheader) return; + // First check whether we should hoist this instruction. if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { // If not, try unfolding a hoistable load. @@ -765,9 +765,9 @@ void MachineLICM::Hoist(MachineInstr *MI) { // terminator instructions. DEBUG({ dbgs() << "Hoisting " << *MI; - if (CurPreheader->getBasicBlock()) + if (Preheader->getBasicBlock()) dbgs() << " to MachineBasicBlock " - << CurPreheader->getName(); + << Preheader->getName(); if (MI->getParent()->getBasicBlock()) dbgs() << " from MachineBasicBlock " << MI->getParent()->getName(); @@ -776,7 +776,10 @@ void MachineLICM::Hoist(MachineInstr *MI) { // If this is the first instruction being hoisted to the preheader, // initialize the CSE map with potential common expressions. - InitCSEMap(CurPreheader); + if (FirstInLoop) { + InitCSEMap(Preheader); + FirstInLoop = false; + } // Look for opportunity to CSE the hoisted instruction. unsigned Opcode = MI->getOpcode(); @@ -784,7 +787,7 @@ void MachineLICM::Hoist(MachineInstr *MI) { CI = CSEMap.find(Opcode); if (!EliminateCSE(MI, CI)) { // Otherwise, splice the instruction to the preheader. - CurPreheader->splice(CurPreheader->getFirstTerminator(),MI->getParent(),MI); + Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); // Clear the kill flags of any register this instruction defines, // since they may need to be live throughout the entire loop @@ -808,3 +811,30 @@ void MachineLICM::Hoist(MachineInstr *MI) { ++NumHoisted; Changed = true; } + +MachineBasicBlock *MachineLICM::getCurPreheader() { + // Determine the block to which to hoist instructions. If we can't find a + // suitable loop predecessor, we can't do any hoisting. + + // If we've tried to get a preheader and failed, don't try again. + if (CurPreheader == reinterpret_cast(-1)) + return 0; + + if (!CurPreheader) { + CurPreheader = CurLoop->getLoopPreheader(); + if (!CurPreheader) { + MachineBasicBlock *Pred = CurLoop->getLoopPredecessor(); + if (!Pred) { + CurPreheader = reinterpret_cast(-1); + return 0; + } + + CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), this); + if (!CurPreheader) { + CurPreheader = reinterpret_cast(-1); + return 0; + } + } + } + return CurPreheader; +} diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 70bf7e5da5e8..5d852f26beda 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -20,7 +20,7 @@ using namespace llvm; MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI) { VRegInfo.reserve(256); RegAllocHints.reserve(256); - RegClass2VRegMap.resize(TRI.getNumRegClasses()+1); // RC ID starts at 1. 
+ RegClass2VRegMap = new std::vector[TRI.getNumRegClasses()]; UsedPhysRegs.resize(TRI.getNumRegs()); // Create the physreg use/def lists. @@ -37,6 +37,7 @@ MachineRegisterInfo::~MachineRegisterInfo() { "PhysRegUseDefLists has entries after all instructions are deleted"); #endif delete [] PhysRegUseDefLists; + delete [] RegClass2VRegMap; } /// setRegClass - Set the register class of the specified virtual register. @@ -52,7 +53,7 @@ MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) { // Remove from old register class's vregs list. This may be slow but // fortunately this operation is rarely needed. std::vector &VRegs = RegClass2VRegMap[OldRC->getID()]; - std::vector::iterator I=std::find(VRegs.begin(), VRegs.end(), VR); + std::vector::iterator I = std::find(VRegs.begin(), VRegs.end(), VR); VRegs.erase(I); // Add to new register class's vregs list. @@ -174,115 +175,36 @@ unsigned MachineRegisterInfo::getLiveInVirtReg(unsigned PReg) const { return 0; } -static cl::opt -SchedLiveInCopies("schedule-livein-copies", cl::Hidden, - cl::desc("Schedule copies of livein registers"), - cl::init(false)); - -/// EmitLiveInCopy - Emit a copy for a live in physical register. If the -/// physical register has only a single copy use, then coalesced the copy -/// if possible. -static void EmitLiveInCopy(MachineBasicBlock *MBB, - MachineBasicBlock::iterator &InsertPos, - unsigned VirtReg, unsigned PhysReg, - const TargetRegisterClass *RC, - DenseMap &CopyRegMap, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const TargetInstrInfo &TII) { - unsigned NumUses = 0; - MachineInstr *UseMI = NULL; - for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(VirtReg), - UE = MRI.use_end(); UI != UE; ++UI) { - UseMI = &*UI; - if (++NumUses > 1) - break; - } - - // If the number of uses is not one, or the use is not a move instruction, - // don't coalesce. Also, only coalesce away a virtual register to virtual - // register copy. - bool Coalesced = false; - unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (NumUses == 1 && - TII.isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - TargetRegisterInfo::isVirtualRegister(DstReg)) { - VirtReg = DstReg; - Coalesced = true; - } - - // Now find an ideal location to insert the copy. - MachineBasicBlock::iterator Pos = InsertPos; - while (Pos != MBB->begin()) { - MachineInstr *PrevMI = prior(Pos); - DenseMap::iterator RI = CopyRegMap.find(PrevMI); - // copyRegToReg might emit multiple instructions to do a copy. - unsigned CopyDstReg = (RI == CopyRegMap.end()) ? 0 : RI->second; - if (CopyDstReg && !TRI.regsOverlap(CopyDstReg, PhysReg)) - // This is what the BB looks like right now: - // r1024 = mov r0 - // ... - // r1 = mov r1024 - // - // We want to insert "r1025 = mov r1". Inserting this copy below the - // move to r1024 makes it impossible for that move to be coalesced. - // - // r1025 = mov r1 - // r1024 = mov r0 - // ... - // r1 = mov 1024 - // r2 = mov 1025 - break; // Woot! Found a good location. - --Pos; - } - - bool Emitted = TII.copyRegToReg(*MBB, Pos, VirtReg, PhysReg, RC, RC, - DebugLoc()); - assert(Emitted && "Unable to issue a live-in copy instruction!\n"); - (void) Emitted; - - CopyRegMap.insert(std::make_pair(prior(Pos), VirtReg)); - if (Coalesced) { - if (&*InsertPos == UseMI) ++InsertPos; - MBB->erase(UseMI); - } -} - /// EmitLiveInCopies - Emit copies to initialize livein virtual registers /// into the given entry block. 
void MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB, const TargetRegisterInfo &TRI, const TargetInstrInfo &TII) { - if (SchedLiveInCopies) { - // Emit the copies at a heuristically-determined location in the block. - DenseMap CopyRegMap; - MachineBasicBlock::iterator InsertPos = EntryMBB->begin(); - for (MachineRegisterInfo::livein_iterator LI = livein_begin(), - E = livein_end(); LI != E; ++LI) - if (LI->second) { - const TargetRegisterClass *RC = getRegClass(LI->second); - EmitLiveInCopy(EntryMBB, InsertPos, LI->second, LI->first, - RC, CopyRegMap, *this, TRI, TII); + // Emit the copies into the top of the block. + for (unsigned i = 0, e = LiveIns.size(); i != e; ++i) + if (LiveIns[i].second) { + if (use_empty(LiveIns[i].second)) { + // The livein has no uses. Drop it. + // + // It would be preferable to have isel avoid creating live-in + // records for unused arguments in the first place, but it's + // complicated by the debug info code for arguments. + LiveIns.erase(LiveIns.begin() + i); + --i; --e; + } else { + // Emit a copy. + BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(), + TII.get(TargetOpcode::COPY), LiveIns[i].second) + .addReg(LiveIns[i].first); + + // Add the register to the entry block live-in set. + EntryMBB->addLiveIn(LiveIns[i].first); } - } else { - // Emit the copies into the top of the block. - for (MachineRegisterInfo::livein_iterator LI = livein_begin(), - E = livein_end(); LI != E; ++LI) - if (LI->second) { - const TargetRegisterClass *RC = getRegClass(LI->second); - bool Emitted = TII.copyRegToReg(*EntryMBB, EntryMBB->begin(), - LI->second, LI->first, RC, RC, - DebugLoc()); - assert(Emitted && "Unable to issue a live-in copy instruction!\n"); - (void) Emitted; - } - } - - // Add function live-ins to entry block live-in set. - for (MachineRegisterInfo::livein_iterator I = livein_begin(), - E = livein_end(); I != E; ++I) - EntryMBB->addLiveIn(I->first); + } else { + // Add the register to the entry block live-in set. + EntryMBB->addLiveIn(LiveIns[i].first); + } } void MachineRegisterInfo::closePhysRegsUsed(const TargetRegisterInfo &TRI) { diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 1610e6c9610c..61334fc1790a 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This pass moves instructions into successor blocks, when possible, so that +// This pass moves instructions into successor blocks when possible, so that // they aren't executed on paths where their results aren't needed. // // This pass is not intended to be a replacement or a complete alternative @@ -45,9 +45,9 @@ namespace { public: static char ID; // Pass identification MachineSinking() : MachineFunctionPass(&ID) {} - + virtual bool runOnMachineFunction(MachineFunction &MF); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); @@ -63,7 +63,7 @@ namespace { bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const; }; } // end anonymous namespace - + char MachineSinking::ID = 0; static RegisterPass X("machine-sink", "Machine code sinking"); @@ -72,7 +72,7 @@ FunctionPass *llvm::createMachineSinkingPass() { return new MachineSinking(); } /// AllUsesDominatedByBlock - Return true if all uses of the specified register /// occur in blocks dominated by the specified block. 
-bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, +bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB) const { assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Only makes sense for vregs"); @@ -80,27 +80,30 @@ bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg, // This may leave a referencing dbg_value in the original block, before // the definition of the vreg. Dwarf generator handles this although the // user might not get the right info at runtime. - for (MachineRegisterInfo::use_nodbg_iterator I = - RegInfo->use_nodbg_begin(Reg), - E = RegInfo->use_nodbg_end(); I != E; ++I) { + for (MachineRegisterInfo::use_nodbg_iterator + I = RegInfo->use_nodbg_begin(Reg), E = RegInfo->use_nodbg_end(); + I != E; ++I) { // Determine the block of the use. MachineInstr *UseInst = &*I; MachineBasicBlock *UseBlock = UseInst->getParent(); + if (UseInst->isPHI()) { // PHI nodes use the operand in the predecessor block, not the block with // the PHI. UseBlock = UseInst->getOperand(I.getOperandNo()+1).getMBB(); } + // Check that it dominates. if (!DT->dominates(MBB, UseBlock)) return false; } + return true; } bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "******** Machine Sinking ********\n"); - + const TargetMachine &TM = MF.getTarget(); TII = TM.getInstrInfo(); TRI = TM.getRegisterInfo(); @@ -111,19 +114,19 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { AllocatableSet = TRI->getAllocatableSet(MF); bool EverMadeChange = false; - + while (1) { bool MadeChange = false; // Process all basic blocks. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) MadeChange |= ProcessBlock(*I); - + // If this iteration over the code changed anything, keep iterating. if (!MadeChange) break; EverMadeChange = true; - } + } return EverMadeChange; } @@ -132,8 +135,8 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (MBB.succ_size() <= 1 || MBB.empty()) return false; // Don't bother sinking code out of unreachable blocks. In addition to being - // unprofitable, it can also lead to infinite looping, because in an unreachable - // loop there may be nowhere to stop. + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. if (!DT->isReachableFromEntry(&MBB)) return false; bool MadeChange = false; @@ -144,7 +147,7 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { bool ProcessedBegin, SawStore = false; do { MachineInstr *MI = I; // The instruction to sink. - + // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. ProcessedBegin = I == MBB.begin(); @@ -156,10 +159,10 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { if (SinkInstruction(MI, SawStore)) ++NumSunk, MadeChange = true; - + // If we just processed the first instruction in the block, we're done. } while (!ProcessedBegin); - + return MadeChange; } @@ -169,7 +172,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // Check if it's safe to move the instruction. if (!MI->isSafeToMove(TII, AA, SawStore)) return false; - + // FIXME: This should include support for sinking instructions within the // block they are currently in to shorten the live ranges. 
We often get // instructions sunk into the top of a large block, but it would be better to @@ -177,22 +180,22 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // be careful not to *increase* register pressure though, e.g. sinking // "x = y + z" down if it kills y and z would increase the live ranges of y // and z and only shrink the live range of x. - + // Loop over all the operands of the specified instruction. If there is // anything we can't handle, bail out. MachineBasicBlock *ParentBlock = MI->getParent(); - + // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. MachineBasicBlock *SuccToSinkTo = 0; - + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; // Ignore non-register operands. - + unsigned Reg = MO.getReg(); if (Reg == 0) continue; - + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { if (MO.isUse()) { // If the physreg has no defs anywhere, it's just an ambient register @@ -200,13 +203,16 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // it could get allocated to something with a def during allocation. if (!RegInfo->def_empty(Reg)) return false; + if (AllocatableSet.test(Reg)) return false; + // Check for a def among the register's aliases too. for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { unsigned AliasReg = *Alias; if (!RegInfo->def_empty(AliasReg)) return false; + if (AllocatableSet.test(AliasReg)) return false; } @@ -221,28 +227,31 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // If it's not safe to move defs of the register class, then abort. if (!TII->isSafeToMoveRegClassDefs(RegInfo->getRegClass(Reg))) return false; - + // FIXME: This picks a successor to sink into based on having one // successor that dominates all the uses. However, there are cases where // sinking can happen but where the sink point isn't a successor. For // example: + // // x = computation // if () {} else {} // use x - // the instruction could be sunk over the whole diamond for the + // + // the instruction could be sunk over the whole diamond for the // if/then/else (or loop, etc), allowing it to be sunk into other blocks // after that. - + // Virtual register defs can only be sunk if all their uses are in blocks // dominated by one of the successors. if (SuccToSinkTo) { // If a previous operand picked a block to sink to, then this operand // must be sinkable to the same block. - if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo)) + if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo)) return false; + continue; } - + // Otherwise, we should look at all the successors and decide which one // we should sink to. for (MachineBasicBlock::succ_iterator SI = ParentBlock->succ_begin(), @@ -252,13 +261,13 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { break; } } - + // If we couldn't find a block to sink to, ignore this instruction. if (SuccToSinkTo == 0) return false; } } - + // If there are no outputs, it must have side-effects. if (SuccToSinkTo == 0) return false; @@ -267,15 +276,26 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { // landing pad is implicitly defined. if (SuccToSinkTo->isLandingPad()) return false; - + // It is not possible to sink an instruction into its own block. This can // happen with loops. 
   if (MI->getParent() == SuccToSinkTo)
     return false;
-
-  DEBUG(dbgs() << "Sink instr " << *MI);
-  DEBUG(dbgs() << "to block " << *SuccToSinkTo);
-
+
+  // If the instruction to move defines a dead physical register which is live
+  // when leaving the basic block, don't move it because it could turn into a
+  // "zombie" define of that preg. E.g., EFLAGS. ()
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    if (SuccToSinkTo->isLiveIn(Reg))
+      return false;
+  }
+
+  DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
+
   // If the block has multiple predecessors, this would introduce computation on
   // a path that it doesn't already exist. We could split the critical edge,
   // but for now we just punt.
@@ -305,18 +325,18 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
     // Otherwise we are OK with sinking along a critical edge.
     DEBUG(dbgs() << "Sinking along critical edge.\n");
   }
-
-  // Determine where to insert into.  Skip phi nodes.
+
+  // Determine where to insert into. Skip phi nodes.
   MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin();
   while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
     ++InsertPos;
-
+
   // Move the instruction.
   SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
                        ++MachineBasicBlock::iterator(MI));
 
-  // Conservatively, clear any kill flags, since it's possible that
-  // they are no longer correct.
+  // Conservatively, clear any kill flags, since it's possible that they are no
+  // longer correct.
   MI->clearKillInfo();
 
   return true;
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 8baf01c90736..2297c908b1e0 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -390,7 +390,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
         report("MBB exits via unconditional fall-through but its successor "
                "differs from its CFG successor!", MBB);
       }
-      if (!MBB->empty() && MBB->back().getDesc().isBarrier()) {
+      if (!MBB->empty() && MBB->back().getDesc().isBarrier() &&
+          !TII->isPredicated(&MBB->back())) {
         report("MBB exits via unconditional fall-through but ends with a "
                "barrier instruction!", MBB);
       }
diff --git a/lib/CodeGen/OptimizeExts.cpp b/lib/CodeGen/OptimizeExts.cpp
index 41fc20407441..dcdc243e5db3 100644
--- a/lib/CodeGen/OptimizeExts.cpp
+++ b/lib/CodeGen/OptimizeExts.cpp
@@ -118,6 +118,26 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB,
         continue;
       }
 
+      // It's an error to translate this:
+      //
+      //    %reg1025 = <sext> %reg1024
+      //     ...
+      //    %reg1026 = SUBREG_TO_REG 0, %reg1024, 4
+      //
+      // into this:
+      //
+      //    %reg1025 = <sext> %reg1024
+      //     ...
+      //    %reg1027 = COPY %reg1025:4
+      //    %reg1026 = SUBREG_TO_REG 0, %reg1027, 4
+      //
+      // The problem here is that SUBREG_TO_REG is there to assert that an
+      // implicit zext occurs. It doesn't insert a zext instruction. If we allow
+      // the COPY here, it will give us the value after the <sext>,
+      // not the original value of %reg1024 before <sext>.
+      if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG)
+        continue;
+
       MachineBasicBlock *UseMBB = UseMI->getParent();
       if (UseMBB == MBB) {
         // Local uses that come after the extension.
@@ -165,8 +185,8 @@ bool OptimizeExts::OptimizeInstr(MachineInstr *MI, MachineBasicBlock *MBB, continue; unsigned NewVR = MRI->createVirtualRegister(RC); BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), - TII->get(TargetOpcode::EXTRACT_SUBREG), NewVR) - .addReg(DstReg).addImm(SubIdx); + TII->get(TargetOpcode::COPY), NewVR) + .addReg(DstReg, 0, SubIdx); UseMO->setReg(NewVR); ++NumReuse; Changed = true; diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 2717d4d5cefc..1613fe21e42d 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -107,6 +107,11 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI, SrcSubIdx == 0 && DstSubIdx == 0 && TargetRegisterInfo::isVirtualRegister(MvSrcReg)) SrcMI = MRI->getVRegDef(MvSrcReg); + else if (SrcMI && SrcMI->isCopy() && + !SrcMI->getOperand(0).getSubReg() && + !SrcMI->getOperand(1).getSubReg() && + TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) + SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); if (!SrcMI) return false; diff --git a/lib/CodeGen/PBQP/HeuristicSolver.h b/lib/CodeGen/PBQP/HeuristicSolver.h index bd18b5216f37..02938df00700 100644 --- a/lib/CodeGen/PBQP/HeuristicSolver.h +++ b/lib/CodeGen/PBQP/HeuristicSolver.h @@ -406,7 +406,7 @@ namespace PBQP { // Create node data objects. for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd(); - nItr != nEnd; ++nItr) { + nItr != nEnd; ++nItr) { nodeDataList.push_back(NodeData()); g.setNodeData(nItr, &nodeDataList.back()); } diff --git a/lib/CodeGen/PBQP/Heuristics/Briggs.h b/lib/CodeGen/PBQP/Heuristics/Briggs.h index 30d34d9e3e92..4c1ce119ed05 100644 --- a/lib/CodeGen/PBQP/Heuristics/Briggs.h +++ b/lib/CodeGen/PBQP/Heuristics/Briggs.h @@ -18,7 +18,6 @@ #ifndef LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H #define LLVM_CODEGEN_PBQP_HEURISTICS_BRIGGS_H -#include "llvm/Support/Compiler.h" #include "../HeuristicSolver.h" #include "../HeuristicBase.h" @@ -267,8 +266,8 @@ namespace PBQP { if (!nd.isHeuristic) return; - EdgeData &ed ATTRIBUTE_UNUSED = getHeuristicEdgeData(eItr); - + EdgeData &ed = getHeuristicEdgeData(eItr); + (void)ed; assert(ed.isUpToDate && "Edge data is not up to date."); // Update node. diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index edbc13f3ff7f..ea6b094d7efe 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -34,7 +34,6 @@ using namespace llvm; STATISTIC(NumAtomic, "Number of atomic phis lowered"); -STATISTIC(NumSplits, "Number of critical edges split on demand"); STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; @@ -184,7 +183,6 @@ void llvm::PHIElimination::LowerAtomicPHINode( // Create a new register for the incoming PHI arguments. MachineFunction &MF = *MBB.getParent(); - const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg); unsigned IncomingReg = 0; bool reusedIncoming = false; // Is IncomingReg reused from an earlier PHI? @@ -208,10 +206,12 @@ void llvm::PHIElimination::LowerAtomicPHINode( ++NumReused; DEBUG(dbgs() << "Reusing %reg" << IncomingReg << " for " << *MPhi); } else { + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg); entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC); } - TII->copyRegToReg(MBB, AfterPHIsIt, DestReg, IncomingReg, RC, RC, - MPhi->getDebugLoc()); + BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), + TII->get(TargetOpcode::COPY), DestReg) + .addReg(IncomingReg); } // Update live variable information if there is any. 
@@ -293,8 +293,8 @@ void llvm::PHIElimination::LowerAtomicPHINode( // Insert the copy. if (!reusedIncoming && IncomingReg) - TII->copyRegToReg(opBlock, InsertPos, IncomingReg, SrcReg, RC, RC, - MPhi->getDebugLoc()); + BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), + TII->get(TargetOpcode::COPY), IncomingReg).addReg(SrcReg); // Now update live variable information if we have it. Otherwise we're done if (!LV) continue; @@ -391,57 +391,8 @@ bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF, // (not considering PHI nodes). If the register is live in to this block // anyway, we would gain nothing from splitting. if (!LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) - SplitCriticalEdge(PreMBB, &MBB); + PreMBB->SplitCriticalEdge(&MBB, this); } } return true; } - -MachineBasicBlock *PHIElimination::SplitCriticalEdge(MachineBasicBlock *A, - MachineBasicBlock *B) { - assert(A && B && "Missing MBB end point"); - - MachineFunction *MF = A->getParent(); - - // We may need to update A's terminator, but we can't do that if AnalyzeBranch - // fails. If A uses a jump table, we won't touch it. - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - MachineBasicBlock *TBB = 0, *FBB = 0; - SmallVector Cond; - if (TII->AnalyzeBranch(*A, TBB, FBB, Cond)) - return NULL; - - ++NumSplits; - - MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); - MF->insert(llvm::next(MachineFunction::iterator(A)), NMBB); - DEBUG(dbgs() << "PHIElimination splitting critical edge:" - " BB#" << A->getNumber() - << " -- BB#" << NMBB->getNumber() - << " -- BB#" << B->getNumber() << '\n'); - - A->ReplaceUsesOfBlockWith(B, NMBB); - A->updateTerminator(); - - // Insert unconditional "jump B" instruction in NMBB if necessary. - NMBB->addSuccessor(B); - if (!NMBB->isLayoutSuccessor(B)) { - Cond.clear(); - MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, B, NULL, Cond); - } - - // Fix PHI nodes in B so they refer to NMBB instead of A - for (MachineBasicBlock::iterator i = B->begin(), e = B->end(); - i != e && i->isPHI(); ++i) - for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2) - if (i->getOperand(ni+1).getMBB() == A) - i->getOperand(ni+1).setMBB(NMBB); - - if (LiveVariables *LV=getAnalysisIfAvailable()) - LV->addNewBlock(NMBB, A, B); - - if (MachineDominatorTree *MDT=getAnalysisIfAvailable()) - MDT->addNewBlock(NMBB, A); - - return NMBB; -} diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 5ea2941b483c..3489db2e9f4f 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -24,6 +24,11 @@ using namespace llvm; //===---------------------------------------------------------------------===// MachinePassRegistry RegisterRegAlloc::Registry; +static FunctionPass *createDefaultRegisterAllocator() { return 0; } +static RegisterRegAlloc +defaultRegAlloc("default", + "pick register allocator based on -O option", + createDefaultRegisterAllocator); //===---------------------------------------------------------------------===// /// @@ -33,8 +38,8 @@ MachinePassRegistry RegisterRegAlloc::Registry; static cl::opt > RegAlloc("regalloc", - cl::init(&createLinearScanRegisterAllocator), - cl::desc("Register allocator to use (default=linearscan)")); + cl::init(&createDefaultRegisterAllocator), + cl::desc("Register allocator to use")); //===---------------------------------------------------------------------===// @@ -42,13 +47,22 @@ RegAlloc("regalloc", /// createRegisterAllocator - choose the appropriate register allocator. 
/// //===---------------------------------------------------------------------===// -FunctionPass *llvm::createRegisterAllocator() { +FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) { RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault(); - + if (!Ctor) { Ctor = RegAlloc; RegisterRegAlloc::setDefault(RegAlloc); } - - return Ctor(); + + if (Ctor != createDefaultRegisterAllocator) + return Ctor(); + + // When the 'default' allocator is requested, pick one based on OptLevel. + switch (OptLevel) { + case CodeGenOpt::None: + return createFastRegisterAllocator(); + default: + return createLinearScanRegisterAllocator(); + } } diff --git a/lib/CodeGen/PostRAHazardRecognizer.cpp b/lib/CodeGen/PostRAHazardRecognizer.cpp new file mode 100644 index 000000000000..cbde2b01eeaf --- /dev/null +++ b/lib/CodeGen/PostRAHazardRecognizer.cpp @@ -0,0 +1,180 @@ +//===----- PostRAHazardRecognizer.cpp - hazard recognizer -------- ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements a hazard recognizer using the instructions itineraries +// defined for the current target. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "post-RA-sched" +#include "llvm/CodeGen/PostRAHazardRecognizer.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrItineraries.h" + +using namespace llvm; + +PostRAHazardRecognizer:: +PostRAHazardRecognizer(const InstrItineraryData &LItinData) : + ScheduleHazardRecognizer(), ItinData(LItinData) { + // Determine the maximum depth of any itinerary. This determines the + // depth of the scoreboard. We always make the scoreboard at least 1 + // cycle deep to avoid dealing with the boundary condition. + unsigned ScoreboardDepth = 1; + if (!ItinData.isEmpty()) { + for (unsigned idx = 0; ; ++idx) { + if (ItinData.isEndMarker(idx)) + break; + + const InstrStage *IS = ItinData.beginStage(idx); + const InstrStage *E = ItinData.endStage(idx); + unsigned ItinDepth = 0; + for (; IS != E; ++IS) + ItinDepth += IS->getCycles(); + + ScoreboardDepth = std::max(ScoreboardDepth, ItinDepth); + } + } + + ReservedScoreboard.reset(ScoreboardDepth); + RequiredScoreboard.reset(ScoreboardDepth); + + DEBUG(dbgs() << "Using post-ra hazard recognizer: ScoreboardDepth = " + << ScoreboardDepth << '\n'); +} + +void PostRAHazardRecognizer::Reset() { + RequiredScoreboard.reset(); + ReservedScoreboard.reset(); +} + +void PostRAHazardRecognizer::ScoreBoard::dump() const { + dbgs() << "Scoreboard:\n"; + + unsigned last = Depth - 1; + while ((last > 0) && ((*this)[last] == 0)) + last--; + + for (unsigned i = 0; i <= last; i++) { + unsigned FUs = (*this)[i]; + dbgs() << "\t"; + for (int j = 31; j >= 0; j--) + dbgs() << ((FUs & (1 << j)) ? '1' : '0'); + dbgs() << '\n'; + } +} + +ScheduleHazardRecognizer::HazardType +PostRAHazardRecognizer::getHazardType(SUnit *SU) { + if (ItinData.isEmpty()) + return NoHazard; + + unsigned cycle = 0; + + // Use the itinerary for the underlying instruction to check for + // free FU's in the scoreboard at the appropriate future cycles. 
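// --- Illustrative sketch, not part of the patch: how an allocator would make
// itself selectable through the -regalloc option consulted by
// createRegisterAllocator() above, mirroring the "default" entry registered in
// this hunk. "mine" and createMyRegisterAllocator are placeholder names; the
// factory simply reuses the in-tree fast allocator to keep the sketch
// self-contained.

static FunctionPass *createMyRegisterAllocator() {
  // A real allocator would return its own pass here.
  return createFastRegisterAllocator();
}

static RegisterRegAlloc
myRegAlloc("mine", "experimental register allocator (example only)",
           createMyRegisterAllocator);

// With this registration, `llc -regalloc=mine` would route allocation through
// the factory above, while the plain -O0/-O2 paths keep using the "default"
// entry and its OptLevel-based choice between fast and linear scan.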
+ unsigned idx = SU->getInstr()->getDesc().getSchedClass(); + for (const InstrStage *IS = ItinData.beginStage(idx), + *E = ItinData.endStage(idx); IS != E; ++IS) { + // We must find one of the stage's units free for every cycle the + // stage is occupied. FIXME it would be more accurate to find the + // same unit free in all the cycles. + for (unsigned int i = 0; i < IS->getCycles(); ++i) { + assert(((cycle + i) < RequiredScoreboard.getDepth()) && + "Scoreboard depth exceeded!"); + + unsigned freeUnits = IS->getUnits(); + switch (IS->getReservationKind()) { + default: + assert(0 && "Invalid FU reservation"); + case InstrStage::Required: + // Required FUs conflict with both reserved and required ones + freeUnits &= ~ReservedScoreboard[cycle + i]; + // FALLTHROUGH + case InstrStage::Reserved: + // Reserved FUs can conflict only with required ones. + freeUnits &= ~RequiredScoreboard[cycle + i]; + break; + } + + if (!freeUnits) { + DEBUG(dbgs() << "*** Hazard in cycle " << (cycle + i) << ", "); + DEBUG(dbgs() << "SU(" << SU->NodeNum << "): "); + DEBUG(SU->getInstr()->dump()); + return Hazard; + } + } + + // Advance the cycle to the next stage. + cycle += IS->getNextCycles(); + } + + return NoHazard; +} + +void PostRAHazardRecognizer::EmitInstruction(SUnit *SU) { + if (ItinData.isEmpty()) + return; + + unsigned cycle = 0; + + // Use the itinerary for the underlying instruction to reserve FU's + // in the scoreboard at the appropriate future cycles. + unsigned idx = SU->getInstr()->getDesc().getSchedClass(); + for (const InstrStage *IS = ItinData.beginStage(idx), + *E = ItinData.endStage(idx); IS != E; ++IS) { + // We must reserve one of the stage's units for every cycle the + // stage is occupied. FIXME it would be more accurate to reserve + // the same unit free in all the cycles. + for (unsigned int i = 0; i < IS->getCycles(); ++i) { + assert(((cycle + i) < RequiredScoreboard.getDepth()) && + "Scoreboard depth exceeded!"); + + unsigned freeUnits = IS->getUnits(); + switch (IS->getReservationKind()) { + default: + assert(0 && "Invalid FU reservation"); + case InstrStage::Required: + // Required FUs conflict with both reserved and required ones + freeUnits &= ~ReservedScoreboard[cycle + i]; + // FALLTHROUGH + case InstrStage::Reserved: + // Reserved FUs can conflict only with required ones. + freeUnits &= ~RequiredScoreboard[cycle + i]; + break; + } + + // reduce to a single unit + unsigned freeUnit = 0; + do { + freeUnit = freeUnits; + freeUnits = freeUnit & (freeUnit - 1); + } while (freeUnits); + + assert(freeUnit && "No function unit available!"); + if (IS->getReservationKind() == InstrStage::Required) + RequiredScoreboard[cycle + i] |= freeUnit; + else + ReservedScoreboard[cycle + i] |= freeUnit; + } + + // Advance the cycle to the next stage. 
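// --- Illustrative sketch, not part of the patch: the "reduce to a single
// unit" loop just above repeatedly clears the lowest set bit (x & (x - 1))
// until one bit remains, so it always reserves the highest-numbered free
// functional unit. A standalone version of the same bit trick:

#include <cassert>

static unsigned pickOneUnit(unsigned freeUnits) {
  unsigned freeUnit = 0;
  do {
    freeUnit = freeUnits;
    freeUnits = freeUnit & (freeUnit - 1); // drop the lowest set bit
  } while (freeUnits);
  return freeUnit; // highest set bit of the original mask (0 if none free)
}

int main() {
  assert(pickOneUnit(0x6) == 0x4); // units 1 and 2 free -> unit 2 is reserved
  assert(pickOneUnit(0x1) == 0x1); // only unit 0 free -> it is reserved
  return 0;
}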
+ cycle += IS->getNextCycles(); + } + + DEBUG(ReservedScoreboard.dump()); + DEBUG(RequiredScoreboard.dump()); +} + +void PostRAHazardRecognizer::AdvanceCycle() { + ReservedScoreboard[0] = 0; ReservedScoreboard.advance(); + RequiredScoreboard[0] = 0; RequiredScoreboard.advance(); +} diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 9714ea653b59..4af8e07f3480 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -22,8 +22,6 @@ #include "AntiDepBreaker.h" #include "AggressiveAntiDepBreaker.h" #include "CriticalAntiDepBreaker.h" -#include "ExactHazardRecognizer.h" -#include "SimpleHazardRecognizer.h" #include "ScheduleDAGInstrs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" @@ -65,10 +63,6 @@ EnableAntiDepBreaking("break-anti-dependencies", cl::desc("Break post-RA scheduling anti-dependencies: " "\"critical\", \"all\", or \"none\""), cl::init("none"), cl::Hidden); -static cl::opt -EnablePostRAHazardAvoidance("avoid-hazards", - cl::desc("Enable exact hazard avoidance"), - cl::init(true), cl::Hidden); // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod static cl::opt @@ -85,6 +79,7 @@ AntiDepBreaker::~AntiDepBreaker() { } namespace { class PostRAScheduler : public MachineFunctionPass { AliasAnalysis *AA; + const TargetInstrInfo *TII; CodeGenOpt::Level OptLevel; public: @@ -187,30 +182,9 @@ namespace { }; } -/// isSchedulingBoundary - Test if the given instruction should be -/// considered a scheduling boundary. This primarily includes labels -/// and terminators. -/// -static bool isSchedulingBoundary(const MachineInstr *MI, - const MachineFunction &MF) { - // Terminators and labels can't be scheduled around. - if (MI->getDesc().isTerminator() || MI->isLabel()) - return true; - - // Don't attempt to schedule around any instruction that defines - // a stack-oriented pointer, as it's unlikely to be profitable. This - // saves compile time, because it doesn't require every single - // stack slot reference to depend on the instruction that does the - // modification. - const TargetLowering &TLI = *MF.getTarget().getTargetLowering(); - if (MI->definesRegister(TLI.getStackPointerRegisterToSaveRestore())) - return true; - - return false; -} - bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { AA = &getAnalysis(); + TII = Fn.getTarget().getInstrInfo(); // Check for explicit enable/disable of post-ra scheduling. TargetSubtarget::AntiDepBreakMode AntiDepMode = TargetSubtarget::ANTIDEP_NONE; @@ -237,10 +211,10 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { const MachineLoopInfo &MLI = getAnalysis(); const MachineDominatorTree &MDT = getAnalysis(); - const InstrItineraryData &InstrItins = Fn.getTarget().getInstrItineraryData(); - ScheduleHazardRecognizer *HR = EnablePostRAHazardAvoidance ? - (ScheduleHazardRecognizer *)new ExactHazardRecognizer(InstrItins) : - (ScheduleHazardRecognizer *)new SimpleHazardRecognizer(); + const TargetMachine &TM = Fn.getTarget(); + const InstrItineraryData &InstrItins = TM.getInstrItineraryData(); + ScheduleHazardRecognizer *HR = + TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins); AntiDepBreaker *ADB = ((AntiDepMode == TargetSubtarget::ANTIDEP_ALL) ? 
(AntiDepBreaker *)new AggressiveAntiDepBreaker(Fn, CriticalPathRCs) : @@ -271,8 +245,8 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { MachineBasicBlock::iterator Current = MBB->end(); unsigned Count = MBB->size(), CurrentCount = Count; for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) { - MachineInstr *MI = prior(I); - if (isSchedulingBoundary(MI, Fn)) { + MachineInstr *MI = llvm::prior(I); + if (TII->isSchedulingBoundary(MI, MBB, Fn)) { Scheduler.Run(MBB, I, Current, CurrentCount); Scheduler.EmitSchedule(); Current = MI; @@ -680,15 +654,6 @@ void SchedulePostRATDList::ListScheduleTopDown() { ScheduleNodeTopDown(FoundSUnit, CurCycle); HazardRec->EmitInstruction(FoundSUnit); CycleHasInsts = true; - - // If we are using the target-specific hazards, then don't - // advance the cycle time just because we schedule a node. If - // the target allows it we can schedule multiple nodes in the - // same cycle. - if (!EnablePostRAHazardAvoidance) { - if (FoundSUnit->Latency) // Don't increment CurCycle for pseudo-ops! - ++CurCycle; - } } else { if (CycleHasInsts) { DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n'); diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp index 96e7327a7eb7..fb2f90935551 100644 --- a/lib/CodeGen/PreAllocSplitting.cpp +++ b/lib/CodeGen/PreAllocSplitting.cpp @@ -512,9 +512,6 @@ PreAllocSplitting::PerformPHIConstruction(MachineBasicBlock::iterator UseI, LI->addRange(LiveRange(UseIndex, EndIndex, RetVNI)); // FIXME: Need to set kills properly for inter-block stuff. - if (RetVNI->isKill(UseIndex)) RetVNI->removeKill(UseIndex); - if (IsIntraBlock) - RetVNI->addKill(EndIndex); } else if (ContainsDefs && ContainsUses) { SmallPtrSet& BlockDefs = Defs[MBB]; SmallPtrSet& BlockUses = Uses[MBB]; @@ -556,12 +553,6 @@ PreAllocSplitting::PerformPHIConstruction(MachineBasicBlock::iterator UseI, NewVNs, LiveOut, Phis, false, true); LI->addRange(LiveRange(StartIndex, EndIndex, RetVNI)); - - if (foundUse && RetVNI->isKill(StartIndex)) - RetVNI->removeKill(StartIndex); - if (IsIntraBlock) { - RetVNI->addKill(EndIndex); - } } // Memoize results so we don't have to recompute them. @@ -636,9 +627,6 @@ PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator Us for (DenseMap::iterator I = IncomingVNs.begin(), E = IncomingVNs.end(); I != E; ++I) { I->second->setHasPHIKill(true); - SlotIndex KillIndex(LIs->getMBBEndIdx(I->first), true); - if (!I->second->isKill(KillIndex)) - I->second->addKill(KillIndex); } } @@ -648,8 +636,6 @@ PreAllocSplitting::PerformPHIConstructionFallBack(MachineBasicBlock::iterator Us } else EndIndex = LIs->getMBBEndIdx(MBB); LI->addRange(LiveRange(StartIndex, EndIndex, RetVNI)); - if (IsIntraBlock) - RetVNI->addKill(EndIndex); // Memoize results so we don't have to recompute them. if (!IsIntraBlock) @@ -691,10 +677,12 @@ void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) { // If the def is a move, set the copy field. 
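[Editorial sketch, not part of the patch.] The loop above only reorders instructions inside regions delimited by scheduling boundaries, which are now identified through TII->isSchedulingBoundary instead of a local helper. A small standalone model of that region-by-region scheduling (all names invented for illustration):

  #include <cstdio>
  #include <functional>
  #include <string>
  #include <vector>

  using Instr = std::string;

  // Split a block at boundary instructions and hand each boundary-free
  // region to the scheduler; boundaries themselves are never moved.
  static void scheduleRegions(const std::vector<Instr> &Block,
                              const std::function<bool(const Instr &)> &IsBoundary,
                              const std::function<void(std::vector<Instr> &)> &Schedule) {
    std::vector<Instr> Region;
    auto Flush = [&] {
      if (!Region.empty()) { Schedule(Region); Region.clear(); }
    };
    for (const Instr &I : Block) {
      if (IsBoundary(I)) {
        Flush();                                       // close current region
        std::printf("boundary kept in place: %s\n", I.c_str());
      } else {
        Region.push_back(I);
      }
    }
    Flush();                                           // trailing region
  }

  int main() {
    std::vector<Instr> Block = {"add", "mul", "LABEL", "load", "store", "ret"};
    scheduleRegions(Block,
        [](const Instr &I) { return I == "LABEL" || I == "ret"; },
        [](std::vector<Instr> &R) {
          std::printf("scheduling region of %zu instrs\n", R.size());
        });
    return 0;
  }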
unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TII->isMoveInstr(*DI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) + if (TII->isMoveInstr(*DI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { if (DstReg == LI->reg) NewVN->setCopy(&*DI); - + } else if (DI->isCopyLike() && DI->getOperand(0).getReg() == LI->reg) + NewVN->setCopy(&*DI); + NewVNs[&*DI] = NewVN; } @@ -725,25 +713,6 @@ void PreAllocSplitting::ReconstructLiveInterval(LiveInterval* LI) { VNInfo* DeadVN = NewVNs[&*DI]; LI->addRange(LiveRange(DefIdx, DefIdx.getNextSlot(), DeadVN)); - DeadVN->addKill(DefIdx); - } - - // Update kill markers. - for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end(); - VI != VE; ++VI) { - VNInfo* VNI = *VI; - for (unsigned i = 0, e = VNI->kills.size(); i != e; ++i) { - SlotIndex KillIdx = VNI->kills[i]; - if (KillIdx.isPHI()) - continue; - MachineInstr *KillMI = LIs->getInstructionFromIndex(KillIdx); - if (KillMI) { - MachineOperand *KillMO = KillMI->findRegisterUseOperand(CurrLI->reg); - if (KillMO) - // It could be a dead def. - KillMO->setIsKill(); - } - } } } @@ -773,19 +742,14 @@ void PreAllocSplitting::RenumberValno(VNInfo* VN) { VNsToCopy.push_back(OldVN); // Locate two-address redefinitions - for (VNInfo::KillSet::iterator KI = OldVN->kills.begin(), - KE = OldVN->kills.end(); KI != KE; ++KI) { - assert(!KI->isPHI() && - "VN previously reported having no PHI kills."); - MachineInstr* MI = LIs->getInstructionFromIndex(*KI); - unsigned DefIdx = MI->findRegisterDefOperandIdx(CurrLI->reg); - if (DefIdx == ~0U) continue; - if (MI->isRegTiedToUseOperand(DefIdx)) { - VNInfo* NextVN = - CurrLI->findDefinedVNInfoForRegInt(KI->getDefIndex()); - if (NextVN == OldVN) continue; + for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(CurrLI->reg), + DE = MRI->def_end(); DI != DE; ++DI) { + if (!DI->isRegTiedToUseOperand(DI.getOperandNo())) continue; + SlotIndex DefIdx = LIs->getInstructionIndex(&*DI).getDefIndex(); + VNInfo* NextVN = CurrLI->findDefinedVNInfoForRegInt(DefIdx); + if (std::find(VNsToCopy.begin(), VNsToCopy.end(), NextVN) != + VNsToCopy.end()) Stack.push_back(NextVN); - } } } @@ -836,7 +800,7 @@ void PreAllocSplitting::RenumberValno(VNInfo* VN) { if (IntervalSSMap.count(CurrLI->reg)) IntervalSSMap[NewVReg] = IntervalSSMap[CurrLI->reg]; - NumRenumbers++; + ++NumRenumbers; } bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo, @@ -854,7 +818,7 @@ bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo, if (KillPt == DefMI->getParent()->end()) return false; - TII->reMaterialize(MBB, RestorePt, VReg, 0, DefMI, TRI); + TII->reMaterialize(MBB, RestorePt, VReg, 0, DefMI, *TRI); SlotIndex RematIdx = LIs->InsertMachineInstrInMaps(prior(RestorePt)); ReconstructLiveInterval(CurrLI); @@ -899,12 +863,11 @@ MachineInstr* PreAllocSplitting::FoldSpill(unsigned vreg, SS = MFI->CreateSpillStackObject(RC->getSize(), RC->getAlignment()); } - MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(), - FoldPt, Ops, SS); + MachineInstr* FMI = TII->foldMemoryOperand(FoldPt, Ops, SS); if (FMI) { LIs->ReplaceMachineInstrInMaps(FoldPt, FMI); - FMI = MBB->insert(MBB->erase(FoldPt), FMI); + FoldPt->eraseFromParent(); ++NumFolds; IntervalSSMap[vreg] = SS; @@ -980,12 +943,11 @@ MachineInstr* PreAllocSplitting::FoldRestore(unsigned vreg, if (!TII->canFoldMemoryOperand(FoldPt, Ops)) return 0; - MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(), - FoldPt, Ops, SS); + MachineInstr* FMI = TII->foldMemoryOperand(FoldPt, Ops, SS); if (FMI) { LIs->ReplaceMachineInstrInMaps(FoldPt, 
FMI); - FMI = MBB->insert(MBB->erase(FoldPt), FMI); + FoldPt->eraseFromParent(); ++NumRestoreFolds; } @@ -1192,7 +1154,7 @@ unsigned PreAllocSplitting::getNumberOfNonSpills( int StoreFrameIndex; unsigned StoreVReg = TII->isStoreToStackSlot(*UI, StoreFrameIndex); if (StoreVReg != Reg || StoreFrameIndex != FrameIndex) - NonSpills++; + ++NonSpills; int DefIdx = (*UI)->findRegisterDefOperandIdx(Reg); if (DefIdx != -1 && (*UI)->isRegTiedToUseOperand(DefIdx)) @@ -1255,7 +1217,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet& split) { (*LI)->removeValNo(CurrVN); DefMI->eraseFromParent(); VNUseCount.erase(CurrVN); - NumDeadSpills++; + ++NumDeadSpills; changed = true; continue; } @@ -1291,9 +1253,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet& split) { Ops.push_back(OpIdx); if (!TII->canFoldMemoryOperand(use, Ops)) continue; - MachineInstr* NewMI = - TII->foldMemoryOperand(*use->getParent()->getParent(), - use, Ops, FrameIndex); + MachineInstr* NewMI = TII->foldMemoryOperand(use, Ops, FrameIndex); if (!NewMI) continue; @@ -1303,10 +1263,9 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet& split) { (*LI)->removeValNo(CurrVN); DefMI->eraseFromParent(); - MachineBasicBlock* MBB = use->getParent(); - NewMI = MBB->insert(MBB->erase(use), NewMI); + use->eraseFromParent(); VNUseCount[CurrVN].erase(use); - + // Remove deleted instructions. Note that we need to remove them from // the VNInfo->use map as well, just to be safe. for (SmallPtrSet::iterator II = @@ -1328,7 +1287,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet& split) { if (VI->second.erase(use)) VI->second.insert(NewMI); - NumDeadSpills++; + ++NumDeadSpills; changed = true; continue; } @@ -1350,7 +1309,7 @@ bool PreAllocSplitting::removeDeadSpills(SmallPtrSet& split) { LIs->RemoveMachineInstrFromMaps(DefMI); (*LI)->removeValNo(CurrVN); DefMI->eraseFromParent(); - NumDeadSpills++; + ++NumDeadSpills; changed = true; } } diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 62f525fa1d97..ca4c47716875 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -46,14 +46,14 @@ bool ProcessImplicitDefs::CanTurnIntoImplicitDef(MachineInstr *MI, const TargetInstrInfo *tii_) { unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - Reg == SrcReg && SrcSubReg == 0 && DstSubReg == 0) + Reg == SrcReg && DstSubReg == 0) return true; - if (OpIdx == 2 && MI->isSubregToReg()) - return true; - if (OpIdx == 1 && MI->isExtractSubreg()) - return true; - return false; + switch(OpIdx) { + case 1: return MI->isCopy() && MI->getOperand(0).getSubReg() == 0; + case 2: return MI->isSubregToReg() && MI->getOperand(0).getSubReg() == 0; + default: return false; + } } /// processImplicitDefs - Process IMPLICIT_DEF instructions and make sure @@ -101,11 +101,10 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { continue; } - if (MI->isInsertSubreg()) { - MachineOperand &MO = MI->getOperand(2); + // Eliminate %reg1032:sub = COPY undef. + if (MI->isCopy() && MI->getOperand(0).getSubReg()) { + MachineOperand &MO = MI->getOperand(1); if (ImpDefRegs.count(MO.getReg())) { - // %reg1032 = INSERT_SUBREG %reg1032, undef, 2 - // This is an identity copy, eliminate it now. 
if (MO.isKill()) { LiveVariables::VarInfo& vi = lv_->getVarInfo(MO.getReg()); vi.removeKill(MI); @@ -119,7 +118,7 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { bool ChangedToImpDef = false; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand& MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) + if (!MO.isReg() || (MO.isDef() && !MO.getSubReg()) || MO.isUndef()) continue; unsigned Reg = MO.getReg(); if (!Reg) @@ -144,6 +143,12 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { Changed = true; MO.setIsUndef(); + // This is a partial register redef of an implicit def. + // Make sure the whole register is defined by the instruction. + if (MO.isDef()) { + MI->addRegisterDefined(Reg); + continue; + } if (MO.isKill() || MI->isRegTiedToDefOperand(i)) { // Make sure other uses of for (unsigned j = i+1; j != e; ++j) { @@ -219,8 +224,10 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) { // Turn a copy use into an implicit_def. unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; - if (tii_->isMoveInstr(*RMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && - Reg == SrcReg && SrcSubReg == 0 && DstSubReg == 0) { + if ((RMI->isCopy() && RMI->getOperand(1).getReg() == Reg && + RMI->getOperand(0).getSubReg() == 0) || + (tii_->isMoveInstr(*RMI, SrcReg, DstReg, SrcSubReg, DstSubReg) && + Reg == SrcReg && DstSubReg == 0)) { RMI->setDesc(tii_->get(TargetOpcode::IMPLICIT_DEF)); bool isKill = false; diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index e778024c6a3a..3843b2537051 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -158,9 +158,9 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) { AdjustsStack = true; FrameSDOps.push_back(I); } else if (I->isInlineAsm()) { - // An InlineAsm might be a call; assume it is to get the stack frame - // aligned correctly for calls. - AdjustsStack = true; + // Some inline asm's need a stack frame, as indicated by operand 1. + if (I->getOperand(1).getImm()) + AdjustsStack = true; } MFI->setAdjustsStack(AdjustsStack); @@ -202,22 +202,17 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) { if (Fn.getFunction()->hasFnAttr(Attribute::Naked)) return; - // Figure out which *callee saved* registers are modified by the current - // function, thus needing to be saved and restored in the prolog/epilog. - const TargetRegisterClass * const *CSRegClasses = - RegInfo->getCalleeSavedRegClasses(&Fn); - std::vector CSI; for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; if (Fn.getRegInfo().isPhysRegUsed(Reg)) { // If the reg is modified, save it! - CSI.push_back(CalleeSavedInfo(Reg, CSRegClasses[i])); + CSI.push_back(CalleeSavedInfo(Reg)); } else { for (const unsigned *AliasSet = RegInfo->getAliasSet(Reg); *AliasSet; ++AliasSet) { // Check alias registers too. 
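[Editorial sketch, not part of the patch.] The ProcessImplicitDefs changes above extend the same underlying idea: a value defined only by IMPLICIT_DEF carries no real data, so copies of it can themselves become IMPLICIT_DEFs and its uses can be flagged undef. A simplified standalone model of that propagation (struct and register names invented for illustration):

  #include <cstdio>
  #include <set>
  #include <string>
  #include <vector>

  struct Inst {
    std::string Op;    // "implicit_def", "copy", or anything else
    std::string Dst;
    std::string Src;   // empty when unused
  };

  // Turn copies whose source is known-undefined into IMPLICIT_DEFs of
  // their destination; any real definition clears the property.
  static void propagateImplicitDefs(std::vector<Inst> &Code) {
    std::set<std::string> ImpDef;
    for (Inst &I : Code) {
      if (I.Op == "implicit_def") {
        ImpDef.insert(I.Dst);
      } else if (I.Op == "copy" && ImpDef.count(I.Src)) {
        I.Op = "implicit_def";           // copy of undef is itself undef
        I.Src.clear();
        ImpDef.insert(I.Dst);
      } else {
        ImpDef.erase(I.Dst);             // a real def ends the undef-ness
      }
    }
  }

  int main() {
    std::vector<Inst> Code = {{"implicit_def", "a", ""},
                              {"copy", "b", "a"},
                              {"add", "c", "b"}};
    propagateImplicitDefs(Code);
    for (const Inst &I : Code)
      std::printf("%s %s %s\n", I.Op.c_str(), I.Dst.c_str(), I.Src.c_str());
    return 0;
  }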
if (Fn.getRegInfo().isPhysRegUsed(*AliasSet)) { - CSI.push_back(CalleeSavedInfo(Reg, CSRegClasses[i])); + CSI.push_back(CalleeSavedInfo(Reg)); break; } } @@ -236,7 +231,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) { for (std::vector::iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { unsigned Reg = I->getReg(); - const TargetRegisterClass *RC = I->getRegClass(); + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); int FrameIdx; if (RegInfo->hasReservedSpillSlot(Fn, Reg, FrameIdx)) { @@ -265,8 +260,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) { if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { // Spill it to the stack where we must. - FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, - true, false); + FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, true); } I->setFrameIdx(FrameIdx); @@ -303,8 +297,10 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { EntryBlock->addLiveIn(CSI[i].getReg()); // Insert the spill to the stack frame. - TII.storeRegToStackSlot(*EntryBlock, I, CSI[i].getReg(), true, - CSI[i].getFrameIdx(), CSI[i].getRegClass(),TRI); + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, + CSI[i].getFrameIdx(), RC, TRI); } } @@ -328,9 +324,11 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { // terminators that preceed it. if (!TII.restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - TII.loadRegFromStackSlot(*MBB, I, CSI[i].getReg(), + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), - CSI[i].getRegClass(), TRI); + RC, TRI); assert(I != MBB->begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. loadRegFromStackSlot can insert @@ -374,10 +372,12 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { MBB->addLiveIn(blockCSI[i].getReg()); // Insert the spill to the stack frame. - TII.storeRegToStackSlot(*MBB, I, blockCSI[i].getReg(), + unsigned Reg = blockCSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*MBB, I, Reg, true, blockCSI[i].getFrameIdx(), - blockCSI[i].getRegClass(), TRI); + RC, TRI); } } @@ -423,9 +423,11 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { // Restore all registers immediately before the return and any // terminators that preceed it. for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) { - TII.loadRegFromStackSlot(*MBB, I, blockCSI[i].getReg(), + unsigned Reg = blockCSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, blockCSI[i].getFrameIdx(), - blockCSI[i].getRegClass(), TRI); + RC, TRI); assert(I != MBB->begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. loadRegFromStackSlot can insert @@ -639,6 +641,9 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { +#ifndef NDEBUG + int SPAdjCount = 0; // frame setup / destroy count. +#endif int SPAdj = 0; // SP offset due to call frame setup / destroy. 
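[Editorial sketch, not part of the patch.] calculateCalleeSavedRegisters keeps a register in the save list if the function modifies it or any register aliasing it, and with this change the spill register class is recomputed via getMinimalPhysRegClass instead of being stored in CalleeSavedInfo. A standalone model of the save-list computation (register names and alias table invented for illustration):

  #include <cstdio>
  #include <map>
  #include <set>
  #include <string>
  #include <vector>

  // A callee-saved register needs a spill slot if the function writes it
  // or any aliasing register (e.g. save r5 when only its low half is used).
  static std::vector<std::string>
  computeCSRsToSave(const std::vector<std::string> &CalleeSaved,
                    const std::set<std::string> &UsedPhysRegs,
                    const std::map<std::string, std::vector<std::string>> &Aliases) {
    std::vector<std::string> ToSave;
    for (const std::string &Reg : CalleeSaved) {
      bool Used = UsedPhysRegs.count(Reg) != 0;
      if (!Used) {
        auto It = Aliases.find(Reg);
        if (It != Aliases.end())
          for (const std::string &A : It->second)
            if (UsedPhysRegs.count(A)) { Used = true; break; }
      }
      if (Used)
        ToSave.push_back(Reg);          // gets a frame index later
    }
    return ToSave;
  }

  int main() {
    std::vector<std::string> CalleeSaved = {"r4", "r5", "r6"};
    std::set<std::string> Used = {"r5l"};   // only the low half of r5
    std::map<std::string, std::vector<std::string>> Aliases = {{"r5", {"r5l", "r5h"}}};
    for (const std::string &R : computeCSRsToSave(CalleeSaved, Used, Aliases))
      std::printf("save %s\n", R.c_str());  // prints: save r5
    return 0;
  }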
if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); @@ -646,6 +651,10 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { if (I->getOpcode() == FrameSetupOpcode || I->getOpcode() == FrameDestroyOpcode) { +#ifndef NDEBUG + // Track whether we see even pairs of them + SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1; +#endif // Remember how much SP has been adjusted to create the call // frame. int Size = I->getOperand(0).getImm(); @@ -712,7 +721,13 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) { if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI); } - assert(SPAdj == 0 && "Unbalanced call frame setup / destroy pairs?"); + // If we have evenly matched pairs of frame setup / destroy instructions, + // make sure the adjustments come out to zero. If we don't have matched + // pairs, we can't be sure the missing bit isn't in another basic block + // due to a custom inserter playing tricks, so just asserting SPAdj==0 + // isn't sufficient. See tMOVCC on Thumb1, for example. + assert((SPAdjCount || SPAdj == 0) && + "Unbalanced call frame setup / destroy pairs?"); } } @@ -870,11 +885,7 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Scavenge a new scratch register CurrentVirtReg = Reg; const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg); - CurrentScratchReg = RS->FindUnusedReg(RC); - if (CurrentScratchReg == 0) - // No register is "free". Scavenge a register. - CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj); - + CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj); PrevValue = Value; } // replace this reference to the virtual register with the diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index b3b576070863..f44478e5dd0b 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -110,6 +110,11 @@ namespace { // Allocatable - vector of allocatable physical registers. BitVector Allocatable; + // SkippedInstrs - Descriptors of instructions whose clobber list was ignored + // because all registers were spilled. It is still necessary to mark all the + // clobbered registers as used by the function. + SmallPtrSet SkippedInstrs; + // isBulkSpilling - This flag is set when LiveRegMap will be cleared // completely after spilling all live registers. LiveRegMap entries should // not be erased. @@ -135,6 +140,8 @@ namespace { private: bool runOnMachineFunction(MachineFunction &Fn); void AllocateBasicBlock(); + void handleThroughOperands(MachineInstr *MI, + SmallVectorImpl &VirtDead); int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC); bool isLastUseOfLocalReg(MachineOperand&); @@ -508,27 +515,20 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum, bool New; tie(LRI, New) = LiveVirtRegs.insert(std::make_pair(VirtReg, LiveReg())); LiveReg &LR = LRI->second; - bool PartialRedef = MI->getOperand(OpNum).getSubReg(); if (New) { // If there is no hint, peek at the only use of this register. if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) && MRI->hasOneNonDBGUse(VirtReg)) { + const MachineInstr &UseMI = *MRI->use_nodbg_begin(VirtReg); unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; // It's a copy, use the destination register as a hint. 
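[Editorial sketch, not part of the patch.] The SPAdjCount bookkeeping above exists so the balance assertion only fires when the frame setup/destroy pseudos actually come in matched pairs within the block; unmatched pairs may legitimately be split across blocks by a custom inserter. A standalone illustration of the same bookkeeping:

  #include <cassert>
  #include <cstdio>
  #include <string>
  #include <utility>
  #include <vector>

  int main() {
    // (kind, size) pairs standing in for FrameSetup/FrameDestroy pseudos.
    std::vector<std::pair<std::string, int>> Block = {
        {"setup", 16}, {"call", 0}, {"destroy", 16}};

    int SPAdj = 0;       // bytes of call frame currently on the stack
    int SPAdjCount = 0;  // +1 per setup, -1 per destroy
    for (const auto &MI : Block) {
      if (MI.first == "setup")   { SPAdj += MI.second; ++SPAdjCount; }
      if (MI.first == "destroy") { SPAdj -= MI.second; --SPAdjCount; }
      std::printf("%-7s -> SPAdj=%d\n", MI.first.c_str(), SPAdj);
    }
    // Only insist on a zero adjustment when every setup in this block saw
    // its matching destroy.
    assert((SPAdjCount != 0 || SPAdj == 0) &&
           "Unbalanced call frame setup / destroy pairs?");
    return 0;
  }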
- if (TII->isMoveInstr(*MRI->use_nodbg_begin(VirtReg), - SrcReg, DstReg, SrcSubReg, DstSubReg)) + if (UseMI.isCopyLike()) + Hint = UseMI.getOperand(0).getReg(); + else if (TII->isMoveInstr(UseMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) Hint = DstReg; } allocVirtReg(MI, *LRI, Hint); - // If this is only a partial redefinition, we must reload the other parts. - if (PartialRedef && MI->readsVirtualRegister(VirtReg)) { - const TargetRegisterClass *RC = MRI->getRegClass(VirtReg); - int FI = getStackSpaceFor(VirtReg, RC); - DEBUG(dbgs() << "Reloading for partial redef: %reg" << VirtReg << "\n"); - TII->loadRegFromStackSlot(*MBB, MI, LR.PhysReg, FI, RC, TRI); - ++NumLoads; - } - } else if (LR.LastUse && !PartialRedef) { + } else if (LR.LastUse) { // Redefining a live register - kill at the last use, unless it is this // instruction defining VirtReg multiple times. if (LR.LastUse != MI || LR.LastUse->getOperand(LR.LastOpNum).isUse()) @@ -564,10 +564,16 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, } else if (LR.Dirty) { if (isLastUseOfLocalReg(MO)) { DEBUG(dbgs() << "Killing last use: " << MO << "\n"); - MO.setIsKill(); + if (MO.isUse()) + MO.setIsKill(); + else + MO.setIsDead(); } else if (MO.isKill()) { DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n"); MO.setIsKill(false); + } else if (MO.isDead()) { + DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n"); + MO.setIsDead(false); } } else if (MO.isKill()) { // We must remove kill flags from uses of reloaded registers because the @@ -576,6 +582,9 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, // This would cause a second reload of %x into a different register. DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n"); MO.setIsKill(false); + } else if (MO.isDead()) { + DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n"); + MO.setIsDead(false); } assert(LR.PhysReg && "Register not assigned"); LR.LastUse = MI; @@ -607,6 +616,91 @@ bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) { return MO.isDead(); } +// Handle special instruction operand like early clobbers and tied ops when +// there are additional physreg defines. +void RAFast::handleThroughOperands(MachineInstr *MI, + SmallVectorImpl &VirtDead) { + DEBUG(dbgs() << "Scanning for through registers:"); + SmallSet ThroughRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) || + (MO.getSubReg() && MI->readsVirtualRegister(Reg))) { + if (ThroughRegs.insert(Reg)) + DEBUG(dbgs() << " %reg" << Reg); + } + } + + // If any physreg defines collide with preallocated through registers, + // we must spill and reallocate. 
+ DEBUG(dbgs() << "\nChecking for physdef collisions.\n"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + UsedInInstr.set(Reg); + if (ThroughRegs.count(PhysRegState[Reg])) + definePhysReg(MI, Reg, regFree); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) { + UsedInInstr.set(*AS); + if (ThroughRegs.count(PhysRegState[*AS])) + definePhysReg(MI, *AS, regFree); + } + } + + SmallVector PartialDefs; + DEBUG(dbgs() << "Allocating tied uses and early clobbers.\n"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + if (MO.isUse()) { + unsigned DefIdx = 0; + if (!MI->isRegTiedToDefOperand(i, &DefIdx)) continue; + DEBUG(dbgs() << "Operand " << i << "("<< MO << ") is tied to operand " + << DefIdx << ".\n"); + LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0); + unsigned PhysReg = LRI->second.PhysReg; + setPhysReg(MI, i, PhysReg); + // Note: we don't update the def operand yet. That would cause the normal + // def-scan to attempt spilling. + } else if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) { + DEBUG(dbgs() << "Partial redefine: " << MO << "\n"); + // Reload the register, but don't assign to the operand just yet. + // That would confuse the later phys-def processing pass. + LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0); + PartialDefs.push_back(LRI->second.PhysReg); + } else if (MO.isEarlyClobber()) { + // Note: defineVirtReg may invalidate MO. + LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0); + unsigned PhysReg = LRI->second.PhysReg; + if (setPhysReg(MI, i, PhysReg)) + VirtDead.push_back(Reg); + } + } + + // Restore UsedInInstr to a state usable for allocating normal virtual uses. + UsedInInstr.reset(); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + UsedInInstr.set(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + UsedInInstr.set(*AS); + } + + // Also mark PartialDefs as used to avoid reallocation. + for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i) + UsedInInstr.set(PartialDefs[i]); +} + void RAFast::AllocateBasicBlock() { DEBUG(dbgs() << "\nAllocating " << *MBB); @@ -620,7 +714,7 @@ void RAFast::AllocateBasicBlock() { E = MBB->livein_end(); I != E; ++I) definePhysReg(MII, *I, regReserved); - SmallVector PhysECs, VirtDead; + SmallVector VirtDead; SmallVector Coalesced; // Otherwise, sequentially allocate each instruction in the MBB. @@ -670,8 +764,25 @@ void RAFast::AllocateBasicBlock() { LiveRegMap::iterator LRI = LiveVirtRegs.find(Reg); if (LRI != LiveVirtRegs.end()) setPhysReg(MI, i, LRI->second.PhysReg); - else - MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + else { + int SS = StackSlotForVirtReg[Reg]; + if (SS == -1) + MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + else { + // Modify DBG_VALUE now that the value is in a spill slot. 
+ uint64_t Offset = MI->getOperand(1).getImm(); + const MDNode *MDPtr = + MI->getOperand(MI->getNumOperands()-1).getMetadata(); + DebugLoc DL = MI->getDebugLoc(); + if (MachineInstr *NewDV = + TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) { + DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); + MachineBasicBlock *MBB = MI->getParent(); + MBB->insert(MBB->erase(MI), NewDV); + } else + MO.setReg(0); // We can't allocate a physreg for a DebugValue, sorry! + } + } } // Next instruction. continue; @@ -679,17 +790,25 @@ void RAFast::AllocateBasicBlock() { // If this is a copy, we may be able to coalesce. unsigned CopySrc, CopyDst, CopySrcSub, CopyDstSub; - if (!TII->isMoveInstr(*MI, CopySrc, CopyDst, CopySrcSub, CopyDstSub)) + if (MI->isCopy()) { + CopyDst = MI->getOperand(0).getReg(); + CopySrc = MI->getOperand(1).getReg(); + CopyDstSub = MI->getOperand(0).getSubReg(); + CopySrcSub = MI->getOperand(1).getSubReg(); + } else if (!TII->isMoveInstr(*MI, CopySrc, CopyDst, CopySrcSub, CopyDstSub)) CopySrc = CopyDst = 0; // Track registers used by instruction. UsedInInstr.reset(); - PhysECs.clear(); // First scan. // Mark physreg uses and early clobbers as used. // Find the end of the virtreg operands unsigned VirtOpEnd = 0; + bool hasTiedOps = false; + bool hasEarlyClobbers = false; + bool hasPartialRedefs = false; + bool hasPhysDefs = false; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; @@ -697,20 +816,44 @@ void RAFast::AllocateBasicBlock() { if (!Reg) continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) { VirtOpEnd = i+1; + if (MO.isUse()) { + hasTiedOps = hasTiedOps || + TID.getOperandConstraint(i, TOI::TIED_TO) != -1; + } else { + if (MO.isEarlyClobber()) + hasEarlyClobbers = true; + if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) + hasPartialRedefs = true; + } continue; } if (!Allocatable.test(Reg)) continue; if (MO.isUse()) { usePhysReg(MO); } else if (MO.isEarlyClobber()) { - definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved); - PhysECs.push_back(Reg); - } + definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ? + regFree : regReserved); + hasEarlyClobbers = true; + } else + hasPhysDefs = true; + } + + // The instruction may have virtual register operands that must be allocated + // the same register at use-time and def-time: early clobbers and tied + // operands. If there are also physical defs, these registers must avoid + // both physical defs and uses, making them more constrained than normal + // operands. + // We didn't detect inline asm tied operands above, so just make this extra + // pass for all inline asm. + if (MI->isInlineAsm() || hasEarlyClobbers || hasPartialRedefs || + (hasTiedOps && hasPhysDefs)) { + handleThroughOperands(MI, VirtDead); + // Don't attempt coalescing when we have funny stuff going on. + CopyDst = 0; } // Second scan. - // Allocate virtreg uses and early clobbers. - // Collect VirtKills + // Allocate virtreg uses. for (unsigned i = 0; i != VirtOpEnd; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; @@ -722,12 +865,6 @@ void RAFast::AllocateBasicBlock() { CopySrc = (CopySrc == Reg || CopySrc == PhysReg) ? PhysReg : 0; if (setPhysReg(MI, i, PhysReg)) killVirtReg(LRI); - } else if (MO.isEarlyClobber()) { - // Note: defineVirtReg may invalidate MO. 
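[Editorial sketch, not part of the patch.] The DBG_VALUE handling above keeps debug information alive across spilling: when the described value is in a stack slot rather than a register, the debug record is rewritten to reference the frame index instead of being nulled out. A simplified standalone model of that decision (the "undef" fallback and all names are invented for illustration):

  #include <cstdio>
  #include <map>
  #include <string>

  struct DebugValue {
    std::string Var;
    std::string Loc;     // "%vN", "fi#N", or "undef"
  };

  // Prefer a register location, then a spill slot, and only then give up.
  static void relocateDebugValue(DebugValue &DV,
                                 const std::map<std::string, std::string> &RegAssignment,
                                 const std::map<std::string, int> &SpillSlot) {
    auto R = RegAssignment.find(DV.Loc);
    if (R != RegAssignment.end()) { DV.Loc = R->second; return; }
    auto S = SpillSlot.find(DV.Loc);
    if (S != SpillSlot.end()) { DV.Loc = "fi#" + std::to_string(S->second); return; }
    DV.Loc = "undef";    // no register and no slot: the location is lost
  }

  int main() {
    DebugValue DV{"x", "%v7"};
    relocateDebugValue(DV, /*RegAssignment=*/{}, /*SpillSlot=*/{{"%v7", 3}});
    std::printf("DBG_VALUE %s @ %s\n", DV.Var.c_str(), DV.Loc.c_str()); // fi#3
    return 0;
  }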
- LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0); - unsigned PhysReg = LRI->second.PhysReg; - setPhysReg(MI, i, PhysReg); - PhysECs.push_back(PhysReg); } } @@ -735,12 +872,16 @@ void RAFast::AllocateBasicBlock() { // Track registers defined by instruction - early clobbers at this point. UsedInInstr.reset(); - for (unsigned i = 0, e = PhysECs.size(); i != e; ++i) { - unsigned PhysReg = PhysECs[i]; - UsedInInstr.set(PhysReg); - for (const unsigned *AS = TRI->getAliasSet(PhysReg); - unsigned Alias = *AS; ++AS) - UsedInInstr.set(Alias); + if (hasEarlyClobbers) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) continue; + unsigned Reg = MO.getReg(); + if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + UsedInInstr.set(Reg); + for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS) + UsedInInstr.set(*AS); + } } unsigned DefOpEnd = MI->getNumOperands(); @@ -752,13 +893,18 @@ void RAFast::AllocateBasicBlock() { DefOpEnd = VirtOpEnd; DEBUG(dbgs() << " Spilling remaining registers before call.\n"); spillAll(MI); + + // The imp-defs are skipped below, but we still need to mark those + // registers as used by the function. + SkippedInstrs.insert(&TID); } // Third scan. // Allocate defs and collect dead defs. for (unsigned i = 0; i != DefOpEnd; ++i) { MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || !MO.getReg()) continue; + if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber()) + continue; unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) { @@ -837,6 +983,14 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { // Make sure the set of used physregs is closed under subreg operations. MRI->closePhysRegsUsed(*TRI); + // Add the clobber lists for all the instructions we skipped earlier. + for (SmallPtrSet::const_iterator + I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I) + if (const unsigned *Defs = (*I)->getImplicitDefs()) + while (*Defs) + MRI->setPhysRegUsed(*Defs++); + + SkippedInstrs.clear(); StackSlotForVirtReg.clear(); return true; } diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp index bc331f0ff81d..044672d6d7a5 100644 --- a/lib/CodeGen/RegAllocLinearScan.cpp +++ b/lib/CodeGen/RegAllocLinearScan.cpp @@ -83,7 +83,8 @@ namespace { // pressure, it can caused fewer GPRs to be held in the queue. static cl::opt NumRecentlyUsedRegs("linearscan-skip-count", - cl::desc("Number of registers for linearscan to remember to skip."), + cl::desc("Number of registers for linearscan to remember" + "to skip."), cl::init(0), cl::Hidden); @@ -421,9 +422,10 @@ unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) { unsigned SrcReg, DstReg, SrcSubReg, DstSubReg; if (vni->def != SlotIndex() && vni->isDefAccurate() && (CopyMI = li_->getInstructionFromIndex(vni->def)) && - tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg)) + (CopyMI->isCopy() || + tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))) // Defined by a copy, try to extend SrcReg forward - CandReg = SrcReg; + CandReg = CopyMI->isCopy() ? 
CopyMI->getOperand(1).getReg() : SrcReg; else if (TrivCoalesceEnds && (CopyMI = li_->getInstructionFromIndex(range.end.getBaseIndex())) && @@ -992,6 +994,24 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { if (Reg && allocatableRegs_[Reg] && RC->contains(Reg)) mri_->setRegAllocationHint(cur->reg, 0, Reg); } + } else if (CopyMI && CopyMI->isCopy()) { + DstReg = CopyMI->getOperand(0).getReg(); + DstSubReg = CopyMI->getOperand(0).getSubReg(); + SrcReg = CopyMI->getOperand(1).getReg(); + SrcSubReg = CopyMI->getOperand(1).getSubReg(); + unsigned Reg = 0; + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) + Reg = SrcReg; + else if (vrm_->isAssignedReg(SrcReg)) + Reg = vrm_->getPhys(SrcReg); + if (Reg) { + if (SrcSubReg) + Reg = tri_->getSubReg(Reg, SrcSubReg); + if (DstSubReg) + Reg = tri_->getMatchingSuperReg(Reg, DstSubReg, RC); + if (Reg && allocatableRegs_[Reg] && RC->contains(Reg)) + mri_->setRegAllocationHint(cur->reg, 0, Reg); + } } } } @@ -1206,8 +1226,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { DEBUG(dbgs() << "\t\t\tspilling(c): " << *cur << '\n'); SmallVector spillIs; std::vector added; - - added = spiller_->spill(cur, spillIs); + spiller_->spill(cur, added, spillIs); std::sort(added.begin(), added.end(), LISorter()); addStackInterval(cur, ls_, li_, mri_, *vrm_); @@ -1285,10 +1304,8 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) { if (sli->beginIndex() < earliestStart) earliestStart = sli->beginIndex(); - std::vector newIs; - newIs = spiller_->spill(sli, spillIs, &earliestStart); + spiller_->spill(sli, added, spillIs, &earliestStart); addStackInterval(sli, ls_, li_, mri_, *vrm_); - std::copy(newIs.begin(), newIs.end(), std::back_inserter(added)); spilled.insert(sli->reg); } diff --git a/lib/CodeGen/RegAllocLocal.cpp b/lib/CodeGen/RegAllocLocal.cpp deleted file mode 100644 index 321ae12def57..000000000000 --- a/lib/CodeGen/RegAllocLocal.cpp +++ /dev/null @@ -1,1254 +0,0 @@ -//===-- RegAllocLocal.cpp - A BasicBlock generic register allocator -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This register allocator allocates registers to a basic block at a time, -// attempting to keep values in registers and reusing registers as appropriate. 
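[Editorial sketch, not part of the patch.] Both allocators now recognize COPY instructions in addition to target-specific moves when deriving allocation hints: if an interval is defined by a copy whose other side already has (or is) a physical register, that register is preferred so the copy can later be coalesced away. A tiny standalone model of that preference (names invented for illustration):

  #include <cstdio>
  #include <map>
  #include <string>

  // Return the physical register suggested by a copy's source, if any.
  static std::string hintFromCopy(const std::string &CopySrc,
                                  const std::map<std::string, std::string> &AssignedPhys) {
    if (!CopySrc.empty() && CopySrc[0] == 'r')   // source already physical
      return CopySrc;
    auto It = AssignedPhys.find(CopySrc);        // source virtreg assigned?
    return It != AssignedPhys.end() ? It->second : std::string();
  }

  int main() {
    std::map<std::string, std::string> AssignedPhys = {{"%v3", "r2"}};
    std::printf("hint: %s\n", hintFromCopy("%v3", AssignedPhys).c_str()); // r2
    std::printf("hint: %s\n", hintFromCopy("r5", AssignedPhys).c_str());  // r5
    return 0;
  }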
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "regalloc" -#include "llvm/BasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegAllocRegistry.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/IndexedMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" -#include -using namespace llvm; - -STATISTIC(NumStores, "Number of stores added"); -STATISTIC(NumLoads , "Number of loads added"); -STATISTIC(NumCopies, "Number of copies coalesced"); - -static RegisterRegAlloc - localRegAlloc("local", "local register allocator", - createLocalRegisterAllocator); - -namespace { - class RALocal : public MachineFunctionPass { - public: - static char ID; - RALocal() : MachineFunctionPass(&ID), StackSlotForVirtReg(-1) {} - private: - const TargetMachine *TM; - MachineFunction *MF; - MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - - // StackSlotForVirtReg - Maps virtual regs to the frame index where these - // values are spilled. - IndexedMap StackSlotForVirtReg; - - // Virt2PhysRegMap - This map contains entries for each virtual register - // that is currently available in a physical register. - IndexedMap Virt2PhysRegMap; - - unsigned &getVirt2PhysRegMapSlot(unsigned VirtReg) { - return Virt2PhysRegMap[VirtReg]; - } - - // PhysRegsUsed - This array is effectively a map, containing entries for - // each physical register that currently has a value (ie, it is in - // Virt2PhysRegMap). The value mapped to is the virtual register - // corresponding to the physical register (the inverse of the - // Virt2PhysRegMap), or 0. The value is set to 0 if this register is pinned - // because it is used by a future instruction, and to -2 if it is not - // allocatable. If the entry for a physical register is -1, then the - // physical register is "not in the map". - // - std::vector PhysRegsUsed; - - // PhysRegsUseOrder - This contains a list of the physical registers that - // currently have a virtual register value in them. This list provides an - // ordering of registers, imposing a reallocation order. This list is only - // used if all registers are allocated and we have to spill one, in which - // case we spill the least recently used register. Entries at the front of - // the list are the least recently used registers, entries at the back are - // the most recently used. - // - std::vector PhysRegsUseOrder; - - // Virt2LastUseMap - This maps each virtual register to its last use - // (MachineInstr*, operand index pair). - IndexedMap, VirtReg2IndexFunctor> - Virt2LastUseMap; - - std::pair& getVirtRegLastUse(unsigned Reg) { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - return Virt2LastUseMap[Reg]; - } - - // VirtRegModified - This bitset contains information about which virtual - // registers need to be spilled back to memory when their registers are - // scavenged. 
If a virtual register has simply been rematerialized, there - // is no reason to spill it to memory when we need the register back. - // - BitVector VirtRegModified; - - // UsedInMultipleBlocks - Tracks whether a particular register is used in - // more than one block. - BitVector UsedInMultipleBlocks; - - void markVirtRegModified(unsigned Reg, bool Val = true) { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - Reg -= TargetRegisterInfo::FirstVirtualRegister; - if (Val) - VirtRegModified.set(Reg); - else - VirtRegModified.reset(Reg); - } - - bool isVirtRegModified(unsigned Reg) const { - assert(TargetRegisterInfo::isVirtualRegister(Reg) && "Illegal VirtReg!"); - assert(Reg - TargetRegisterInfo::FirstVirtualRegister < - VirtRegModified.size() && "Illegal virtual register!"); - return VirtRegModified[Reg - TargetRegisterInfo::FirstVirtualRegister]; - } - - void AddToPhysRegsUseOrder(unsigned Reg) { - std::vector::iterator It = - std::find(PhysRegsUseOrder.begin(), PhysRegsUseOrder.end(), Reg); - if (It != PhysRegsUseOrder.end()) - PhysRegsUseOrder.erase(It); - PhysRegsUseOrder.push_back(Reg); - } - - void MarkPhysRegRecentlyUsed(unsigned Reg) { - if (PhysRegsUseOrder.empty() || - PhysRegsUseOrder.back() == Reg) return; // Already most recently used - - for (unsigned i = PhysRegsUseOrder.size(); i != 0; --i) { - unsigned RegMatch = PhysRegsUseOrder[i-1]; // remove from middle - if (!areRegsEqual(Reg, RegMatch)) continue; - - PhysRegsUseOrder.erase(PhysRegsUseOrder.begin()+i-1); - // Add it to the end of the list - PhysRegsUseOrder.push_back(RegMatch); - if (RegMatch == Reg) - return; // Found an exact match, exit early - } - } - - public: - virtual const char *getPassName() const { - return "Local Register Allocator"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequiredID(PHIEliminationID); - AU.addRequiredID(TwoAddressInstructionPassID); - MachineFunctionPass::getAnalysisUsage(AU); - } - - private: - /// runOnMachineFunction - Register allocate the whole function - bool runOnMachineFunction(MachineFunction &Fn); - - /// AllocateBasicBlock - Register allocate the specified basic block. - void AllocateBasicBlock(MachineBasicBlock &MBB); - - - /// areRegsEqual - This method returns true if the specified registers are - /// related to each other. To do this, it checks to see if they are equal - /// or if the first register is in the alias set of the second register. - /// - bool areRegsEqual(unsigned R1, unsigned R2) const { - if (R1 == R2) return true; - for (const unsigned *AliasSet = TRI->getAliasSet(R2); - *AliasSet; ++AliasSet) { - if (*AliasSet == R1) return true; - } - return false; - } - - /// getStackSpaceFor - This returns the frame index of the specified virtual - /// register on the stack, allocating space if necessary. - int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC); - - /// removePhysReg - This method marks the specified physical register as no - /// longer being in use. - /// - void removePhysReg(unsigned PhysReg); - - void storeVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned VirtReg, unsigned PhysReg, bool isKill); - - /// spillVirtReg - This method spills the value specified by PhysReg into - /// the virtual register slot specified by VirtReg. It then updates the RA - /// data structures to indicate the fact that PhysReg is now available. 
- /// - void spillVirtReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned VirtReg, unsigned PhysReg); - - /// spillPhysReg - This method spills the specified physical register into - /// the virtual register slot associated with it. If OnlyVirtRegs is set to - /// true, then the request is ignored if the physical register does not - /// contain a virtual register. - /// - void spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I, - unsigned PhysReg, bool OnlyVirtRegs = false); - - /// assignVirtToPhysReg - This method updates local state so that we know - /// that PhysReg is the proper container for VirtReg now. The physical - /// register must not be used for anything else when this is called. - /// - void assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg); - - /// isPhysRegAvailable - Return true if the specified physical register is - /// free and available for use. This also includes checking to see if - /// aliased registers are all free... - /// - bool isPhysRegAvailable(unsigned PhysReg) const; - - /// getFreeReg - Look to see if there is a free register available in the - /// specified register class. If not, return 0. - /// - unsigned getFreeReg(const TargetRegisterClass *RC); - - /// getReg - Find a physical register to hold the specified virtual - /// register. If all compatible physical registers are used, this method - /// spills the last used virtual register to the stack, and uses that - /// register. If NoFree is true, that means the caller knows there isn't - /// a free register, do not call getFreeReg(). - unsigned getReg(MachineBasicBlock &MBB, MachineInstr *MI, - unsigned VirtReg, bool NoFree = false); - - /// reloadVirtReg - This method transforms the specified virtual - /// register use to refer to a physical register. This method may do this - /// in one of several ways: if the register is available in a physical - /// register already, it uses that physical register. If the value is not - /// in a physical register, and if there are physical registers available, - /// it loads it into a register: PhysReg if that is an available physical - /// register, otherwise any physical register of the right class. - /// If register pressure is high, and it is possible, it tries to fold the - /// load of the virtual register into the instruction itself. It avoids - /// doing this if register pressure is low to improve the chance that - /// subsequent instructions can use the reloaded value. This method - /// returns the modified instruction. - /// - MachineInstr *reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI, - unsigned OpNum, SmallSet &RRegs, - unsigned PhysReg); - - /// ComputeLocalLiveness - Computes liveness of registers within a basic - /// block, setting the killed/dead flags as appropriate. - void ComputeLocalLiveness(MachineBasicBlock& MBB); - - void reloadPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I, - unsigned PhysReg); - }; - char RALocal::ID = 0; -} - -/// getStackSpaceFor - This allocates space for the specified virtual register -/// to be held on the stack. -int RALocal::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { - // Find the location Reg would belong... - int SS = StackSlotForVirtReg[VirtReg]; - if (SS != -1) - return SS; // Already has space allocated? - - // Allocate a new stack object for this spill location... - int FrameIdx = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(), - RC->getAlignment()); - - // Assign the slot. 
- StackSlotForVirtReg[VirtReg] = FrameIdx; - return FrameIdx; -} - - -/// removePhysReg - This method marks the specified physical register as no -/// longer being in use. -/// -void RALocal::removePhysReg(unsigned PhysReg) { - PhysRegsUsed[PhysReg] = -1; // PhyReg no longer used - - std::vector::iterator It = - std::find(PhysRegsUseOrder.begin(), PhysRegsUseOrder.end(), PhysReg); - if (It != PhysRegsUseOrder.end()) - PhysRegsUseOrder.erase(It); -} - -/// storeVirtReg - Store a virtual register to its assigned stack slot. -void RALocal::storeVirtReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned VirtReg, unsigned PhysReg, - bool isKill) { - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg); - int FrameIndex = getStackSpaceFor(VirtReg, RC); - DEBUG(dbgs() << " to stack slot #" << FrameIndex); - TII->storeRegToStackSlot(MBB, I, PhysReg, isKill, FrameIndex, RC, TRI); - ++NumStores; // Update statistics - - // Mark the spill instruction as last use if we're not killing the register. - if (!isKill) { - MachineInstr *Spill = llvm::prior(I); - int OpNum = Spill->findRegisterUseOperandIdx(PhysReg); - if (OpNum < 0) - getVirtRegLastUse(VirtReg) = std::make_pair((MachineInstr*)0, 0); - else - getVirtRegLastUse(VirtReg) = std::make_pair(Spill, OpNum); - } -} - -/// spillVirtReg - This method spills the value specified by PhysReg into the -/// virtual register slot specified by VirtReg. It then updates the RA data -/// structures to indicate the fact that PhysReg is now available. -/// -void RALocal::spillVirtReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned VirtReg, unsigned PhysReg) { - assert(VirtReg && "Spilling a physical register is illegal!" - " Must not have appropriate kill for the register or use exists beyond" - " the intended one."); - DEBUG(dbgs() << " Spilling register " << TRI->getName(PhysReg) - << " containing %reg" << VirtReg); - - if (!isVirtRegModified(VirtReg)) { - DEBUG(dbgs() << " which has not been modified, so no store necessary!"); - std::pair &LastUse = getVirtRegLastUse(VirtReg); - if (LastUse.first) - LastUse.first->getOperand(LastUse.second).setIsKill(); - } else { - // Otherwise, there is a virtual register corresponding to this physical - // register. We only need to spill it into its stack slot if it has been - // modified. - // If the instruction reads the register that's spilled, (e.g. this can - // happen if it is a move to a physical register), then the spill - // instruction is not a kill. - bool isKill = !(I != MBB.end() && I->readsRegister(PhysReg)); - storeVirtReg(MBB, I, VirtReg, PhysReg, isKill); - } - - getVirt2PhysRegMapSlot(VirtReg) = 0; // VirtReg no longer available - - DEBUG(dbgs() << '\n'); - removePhysReg(PhysReg); -} - - -/// spillPhysReg - This method spills the specified physical register into the -/// virtual register slot associated with it. If OnlyVirtRegs is set to true, -/// then the request is ignored if the physical register does not contain a -/// virtual register. -/// -void RALocal::spillPhysReg(MachineBasicBlock &MBB, MachineInstr *I, - unsigned PhysReg, bool OnlyVirtRegs) { - if (PhysRegsUsed[PhysReg] != -1) { // Only spill it if it's used! - assert(PhysRegsUsed[PhysReg] != -2 && "Non allocable reg used!"); - if (PhysRegsUsed[PhysReg] || !OnlyVirtRegs) - spillVirtReg(MBB, I, PhysRegsUsed[PhysReg], PhysReg); - return; - } - - // If the selected register aliases any other registers, we must make - // sure that one of the aliases isn't alive. 
- for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) { - if (PhysRegsUsed[*AliasSet] == -1 || // Spill aliased register. - PhysRegsUsed[*AliasSet] == -2) // If allocatable. - continue; - - if (PhysRegsUsed[*AliasSet]) - spillVirtReg(MBB, I, PhysRegsUsed[*AliasSet], *AliasSet); - } -} - - -/// assignVirtToPhysReg - This method updates local state so that we know -/// that PhysReg is the proper container for VirtReg now. The physical -/// register must not be used for anything else when this is called. -/// -void RALocal::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) { - assert(PhysRegsUsed[PhysReg] == -1 && "Phys reg already assigned!"); - // Update information to note the fact that this register was just used, and - // it holds VirtReg. - PhysRegsUsed[PhysReg] = VirtReg; - getVirt2PhysRegMapSlot(VirtReg) = PhysReg; - AddToPhysRegsUseOrder(PhysReg); // New use of PhysReg -} - - -/// isPhysRegAvailable - Return true if the specified physical register is free -/// and available for use. This also includes checking to see if aliased -/// registers are all free... -/// -bool RALocal::isPhysRegAvailable(unsigned PhysReg) const { - if (PhysRegsUsed[PhysReg] != -1) return false; - - // If the selected register aliases any other allocated registers, it is - // not free! - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) - if (PhysRegsUsed[*AliasSet] >= 0) // Aliased register in use? - return false; // Can't use this reg then. - return true; -} - - -/// getFreeReg - Look to see if there is a free register available in the -/// specified register class. If not, return 0. -/// -unsigned RALocal::getFreeReg(const TargetRegisterClass *RC) { - // Get iterators defining the range of registers that are valid to allocate in - // this class, which also specifies the preferred allocation order. - TargetRegisterClass::iterator RI = RC->allocation_order_begin(*MF); - TargetRegisterClass::iterator RE = RC->allocation_order_end(*MF); - - for (; RI != RE; ++RI) - if (isPhysRegAvailable(*RI)) { // Is reg unused? - assert(*RI != 0 && "Cannot use register!"); - return *RI; // Found an unused register! - } - return 0; -} - - -/// getReg - Find a physical register to hold the specified virtual -/// register. If all compatible physical registers are used, this method spills -/// the last used virtual register to the stack, and uses that register. -/// -unsigned RALocal::getReg(MachineBasicBlock &MBB, MachineInstr *I, - unsigned VirtReg, bool NoFree) { - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg); - - // First check to see if we have a free register of the requested type... - unsigned PhysReg = NoFree ? 0 : getFreeReg(RC); - - if (PhysReg != 0) { - // Assign the register. - assignVirtToPhysReg(VirtReg, PhysReg); - return PhysReg; - } - - // If we didn't find an unused register, scavenge one now! - assert(!PhysRegsUseOrder.empty() && "No allocated registers??"); - - // Loop over all of the preallocated registers from the least recently used - // to the most recently used. When we find one that is capable of holding - // our register, use it. - for (unsigned i = 0; PhysReg == 0; ++i) { - assert(i != PhysRegsUseOrder.size() && - "Couldn't find a register of the appropriate class!"); - - unsigned R = PhysRegsUseOrder[i]; - - // We can only use this register if it holds a virtual register (ie, it - // can be spilled). Do not use it if it is an explicitly allocated - // physical register! 
- assert(PhysRegsUsed[R] != -1 && - "PhysReg in PhysRegsUseOrder, but is not allocated?"); - if (PhysRegsUsed[R] && PhysRegsUsed[R] != -2) { - // If the current register is compatible, use it. - if (RC->contains(R)) { - PhysReg = R; - break; - } - - // If one of the registers aliased to the current register is - // compatible, use it. - for (const unsigned *AliasIt = TRI->getAliasSet(R); - *AliasIt; ++AliasIt) { - if (!RC->contains(*AliasIt)) continue; - - // If this is pinned down for some reason, don't use it. For - // example, if CL is pinned, and we run across CH, don't use - // CH as justification for using scavenging ECX (which will - // fail). - if (PhysRegsUsed[*AliasIt] == 0) continue; - - // Make sure the register is allocatable. Don't allocate SIL on - // x86-32. - if (PhysRegsUsed[*AliasIt] == -2) continue; - - PhysReg = *AliasIt; // Take an aliased register - break; - } - } - } - - assert(PhysReg && "Physical register not assigned!?!?"); - - // At this point PhysRegsUseOrder[i] is the least recently used register of - // compatible register class. Spill it to memory and reap its remains. - spillPhysReg(MBB, I, PhysReg); - - // Now that we know which register we need to assign this to, do it now! - assignVirtToPhysReg(VirtReg, PhysReg); - return PhysReg; -} - - -/// reloadVirtReg - This method transforms the specified virtual -/// register use to refer to a physical register. This method may do this in -/// one of several ways: if the register is available in a physical register -/// already, it uses that physical register. If the value is not in a physical -/// register, and if there are physical registers available, it loads it into a -/// register: PhysReg if that is an available physical register, otherwise any -/// register. If register pressure is high, and it is possible, it tries to -/// fold the load of the virtual register into the instruction itself. It -/// avoids doing this if register pressure is low to improve the chance that -/// subsequent instructions can use the reloaded value. This method returns -/// the modified instruction. -/// -MachineInstr *RALocal::reloadVirtReg(MachineBasicBlock &MBB, MachineInstr *MI, - unsigned OpNum, - SmallSet &ReloadedRegs, - unsigned PhysReg) { - unsigned VirtReg = MI->getOperand(OpNum).getReg(); - unsigned SubIdx = MI->getOperand(OpNum).getSubReg(); - - // If the virtual register is already available, just update the instruction - // and return. - if (unsigned PR = getVirt2PhysRegMapSlot(VirtReg)) { - if (SubIdx) { - PR = TRI->getSubReg(PR, SubIdx); - MI->getOperand(OpNum).setSubReg(0); - } - MI->getOperand(OpNum).setReg(PR); // Assign the input register - if (!MI->isDebugValue()) { - // Do not do these for DBG_VALUE as they can affect codegen. - MarkPhysRegRecentlyUsed(PR); // Already have this value available! - getVirtRegLastUse(VirtReg) = std::make_pair(MI, OpNum); - } - return MI; - } - - // Otherwise, we need to fold it into the current instruction, or reload it. - // If we have registers available to hold the value, use them. - const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(VirtReg); - // If we already have a PhysReg (this happens when the instruction is a - // reg-to-reg copy with a PhysReg destination) use that. - if (!PhysReg || !TargetRegisterInfo::isPhysicalRegister(PhysReg) || - !isPhysRegAvailable(PhysReg)) - PhysReg = getFreeReg(RC); - int FrameIndex = getStackSpaceFor(VirtReg, RC); - - if (PhysReg) { // Register is available, allocate it! 
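[Editorial sketch, not part of the patch.] The deleted local allocator's eviction policy, visible in getReg above, is least-recently-used: PhysRegsUseOrder keeps physical registers ordered from least to most recently used, and the front-most compatible entry is spilled when nothing is free. A simplified standalone model of that bookkeeping (names invented for illustration):

  #include <algorithm>
  #include <cstdio>
  #include <string>
  #include <vector>

  struct UseOrder {
    std::vector<std::string> Order;   // front = least recently used

    void markUsed(const std::string &Reg) {
      auto It = std::find(Order.begin(), Order.end(), Reg);
      if (It != Order.end())
        Order.erase(It);              // move an existing entry...
      Order.push_back(Reg);           // ...to the most-recently-used end
    }

    // Pick the least recently used register the predicate accepts.
    template <class Pred> std::string pickVictim(Pred Compatible) const {
      for (const std::string &Reg : Order)
        if (Compatible(Reg))
          return Reg;
      return {};
    }
  };

  int main() {
    UseOrder U;
    U.markUsed("r0"); U.markUsed("r1"); U.markUsed("r0"); // r1 is now oldest
    std::string Victim = U.pickVictim([](const std::string &) { return true; });
    std::printf("spill %s\n", Victim.c_str());            // prints: spill r1
    return 0;
  }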
- assignVirtToPhysReg(VirtReg, PhysReg); - } else { // No registers available. - // Force some poor hapless value out of the register file to - // make room for the new register, and reload it. - PhysReg = getReg(MBB, MI, VirtReg, true); - } - - markVirtRegModified(VirtReg, false); // Note that this reg was just reloaded - - DEBUG(dbgs() << " Reloading %reg" << VirtReg << " into " - << TRI->getName(PhysReg) << "\n"); - - // Add move instruction(s) - TII->loadRegFromStackSlot(MBB, MI, PhysReg, FrameIndex, RC, TRI); - ++NumLoads; // Update statistics - - MF->getRegInfo().setPhysRegUsed(PhysReg); - // Assign the input register. - if (SubIdx) { - MI->getOperand(OpNum).setSubReg(0); - MI->getOperand(OpNum).setReg(TRI->getSubReg(PhysReg, SubIdx)); - } else - MI->getOperand(OpNum).setReg(PhysReg); // Assign the input register - getVirtRegLastUse(VirtReg) = std::make_pair(MI, OpNum); - - if (!ReloadedRegs.insert(PhysReg)) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Ran out of registers during register allocation!"; - if (MI->isInlineAsm()) { - Msg << "\nPlease check your inline asm statement for invalid " - << "constraints:\n"; - MI->print(Msg, TM); - } - report_fatal_error(Msg.str()); - } - for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); - *SubRegs; ++SubRegs) { - if (ReloadedRegs.insert(*SubRegs)) continue; - - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Ran out of registers during register allocation!"; - if (MI->isInlineAsm()) { - Msg << "\nPlease check your inline asm statement for invalid " - << "constraints:\n"; - MI->print(Msg, TM); - } - report_fatal_error(Msg.str()); - } - - return MI; -} - -/// isReadModWriteImplicitKill - True if this is an implicit kill for a -/// read/mod/write register, i.e. update partial register. -static bool isReadModWriteImplicitKill(MachineInstr *MI, unsigned Reg) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() && - MO.isDef() && !MO.isDead()) - return true; - } - return false; -} - -/// isReadModWriteImplicitDef - True if this is an implicit def for a -/// read/mod/write register, i.e. update partial register. -static bool isReadModWriteImplicitDef(MachineInstr *MI, unsigned Reg) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.getReg() == Reg && MO.isImplicit() && - !MO.isDef() && MO.isKill()) - return true; - } - return false; -} - -// precedes - Helper function to determine with MachineInstr A -// precedes MachineInstr B within the same MBB. -static bool precedes(MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B) { - if (A == B) - return false; - - MachineBasicBlock::iterator I = A->getParent()->begin(); - while (I != A->getParent()->end()) { - if (I == A) - return true; - else if (I == B) - return false; - - ++I; - } - - return false; -} - -/// ComputeLocalLiveness - Computes liveness of registers within a basic -/// block, setting the killed/dead flags as appropriate. -void RALocal::ComputeLocalLiveness(MachineBasicBlock& MBB) { - // Keep track of the most recently seen previous use or def of each reg, - // so that we can update them with dead/kill markers. 
- DenseMap > LastUseDef; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (I->isDebugValue()) - continue; - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &MO = I->getOperand(i); - // Uses don't trigger any flags, but we need to save - // them for later. Also, we have to process these - // _before_ processing the defs, since an instr - // uses regs before it defs them. - if (!MO.isReg() || !MO.getReg() || !MO.isUse()) - continue; - - // Ignore helpful kill flags from earlier passes. - MO.setIsKill(false); - - LastUseDef[MO.getReg()] = std::make_pair(I, i); - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; - - const unsigned *Aliases = TRI->getAliasSet(MO.getReg()); - if (Aliases == 0) - continue; - - while (*Aliases) { - DenseMap >::iterator - alias = LastUseDef.find(*Aliases); - - if (alias != LastUseDef.end() && alias->second.first != I) - LastUseDef[*Aliases] = std::make_pair(I, i); - - ++Aliases; - } - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &MO = I->getOperand(i); - // Defs others than 2-addr redefs _do_ trigger flag changes: - // - A def followed by a def is dead - // - A use followed by a def is a kill - if (!MO.isReg() || !MO.getReg() || !MO.isDef()) continue; - - unsigned SubIdx = MO.getSubReg(); - DenseMap >::iterator - last = LastUseDef.find(MO.getReg()); - if (last != LastUseDef.end()) { - // Check if this is a two address instruction. If so, then - // the def does not kill the use. - if (last->second.first == I && I->isRegTiedToUseOperand(i)) - continue; - - MachineOperand &lastUD = - last->second.first->getOperand(last->second.second); - if (SubIdx && lastUD.getSubReg() != SubIdx) - // Partial re-def, the last def is not dead. - // %reg1024:5 = - // %reg1024:6 = - // or - // %reg1024:5 = op %reg1024, 5 - continue; - - if (lastUD.isDef()) - lastUD.setIsDead(true); - else - lastUD.setIsKill(true); - } - - LastUseDef[MO.getReg()] = std::make_pair(I, i); - } - } - - // Live-out (of the function) registers contain return values of the function, - // so we need to make sure they are alive at return time. - MachineBasicBlock::iterator Ret = MBB.getFirstTerminator(); - bool BBEndsInReturn = (Ret != MBB.end() && Ret->getDesc().isReturn()); - - if (BBEndsInReturn) - for (MachineRegisterInfo::liveout_iterator - I = MF->getRegInfo().liveout_begin(), - E = MF->getRegInfo().liveout_end(); I != E; ++I) - if (!Ret->readsRegister(*I)) { - Ret->addOperand(MachineOperand::CreateReg(*I, false, true)); - LastUseDef[*I] = std::make_pair(Ret, Ret->getNumOperands()-1); - } - - // Finally, loop over the final use/def of each reg - // in the block and determine if it is dead. - for (DenseMap >::iterator - I = LastUseDef.begin(), E = LastUseDef.end(); I != E; ++I) { - MachineInstr *MI = I->second.first; - unsigned idx = I->second.second; - MachineOperand &MO = MI->getOperand(idx); - - bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(MO.getReg()); - - // A crude approximation of "live-out" calculation - bool usedOutsideBlock = isPhysReg ? false : - UsedInMultipleBlocks.test(MO.getReg() - - TargetRegisterInfo::FirstVirtualRegister); - - // If the machine BB ends in a return instruction, then the value isn't used - // outside of the BB. 
- if (!isPhysReg && (!usedOutsideBlock || BBEndsInReturn)) { - // DBG_VALUE complicates this: if the only refs of a register outside - // this block are DBG_VALUE, we can't keep the reg live just for that, - // as it will cause the reg to be spilled at the end of this block when - // it wouldn't have been otherwise. Nullify the DBG_VALUEs when that - // happens. - bool UsedByDebugValueOnly = false; - for (MachineRegisterInfo::reg_iterator UI = MRI->reg_begin(MO.getReg()), - UE = MRI->reg_end(); UI != UE; ++UI) { - // Two cases: - // - used in another block - // - used in the same block before it is defined (loop) - if (UI->getParent() == &MBB && - !(MO.isDef() && UI.getOperand().isUse() && precedes(&*UI, MI))) - continue; - - if (UI->isDebugValue()) { - UsedByDebugValueOnly = true; - continue; - } - - // A non-DBG_VALUE use means we can leave DBG_VALUE uses alone. - UsedInMultipleBlocks.set(MO.getReg() - - TargetRegisterInfo::FirstVirtualRegister); - usedOutsideBlock = true; - UsedByDebugValueOnly = false; - break; - } - - if (UsedByDebugValueOnly) - for (MachineRegisterInfo::reg_iterator UI = MRI->reg_begin(MO.getReg()), - UE = MRI->reg_end(); UI != UE; ++UI) - if (UI->isDebugValue() && - (UI->getParent() != &MBB || - (MO.isDef() && precedes(&*UI, MI)))) - UI.getOperand().setReg(0U); - } - - // Physical registers and those that are not live-out of the block are - // killed/dead at their last use/def within this block. - if (isPhysReg || !usedOutsideBlock || BBEndsInReturn) { - if (MO.isUse()) { - // Don't mark uses that are tied to defs as kills. - if (!MI->isRegTiedToDefOperand(idx)) - MO.setIsKill(true); - } else { - MO.setIsDead(true); - } - } - } -} - -void RALocal::AllocateBasicBlock(MachineBasicBlock &MBB) { - // loop over each instruction - MachineBasicBlock::iterator MII = MBB.begin(); - - DEBUG({ - const BasicBlock *LBB = MBB.getBasicBlock(); - if (LBB) - dbgs() << "\nStarting RegAlloc of BB: " << LBB->getName(); - }); - - // Add live-in registers as active. - for (MachineBasicBlock::livein_iterator I = MBB.livein_begin(), - E = MBB.livein_end(); I != E; ++I) { - unsigned Reg = *I; - MF->getRegInfo().setPhysRegUsed(Reg); - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - - AddToPhysRegsUseOrder(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - MF->getRegInfo().setPhysRegUsed(*SubRegs); - } - } - - ComputeLocalLiveness(MBB); - - // Otherwise, sequentially allocate each instruction in the MBB. - while (MII != MBB.end()) { - MachineInstr *MI = MII++; - const TargetInstrDesc &TID = MI->getDesc(); - DEBUG({ - dbgs() << "\nStarting RegAlloc of: " << *MI; - dbgs() << " Regs have values: "; - for (unsigned i = 0; i != TRI->getNumRegs(); ++i) - if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2) { - if (PhysRegsUsed[i] && isVirtRegModified(PhysRegsUsed[i])) - dbgs() << "*"; - dbgs() << "[" << TRI->getName(i) - << ",%reg" << PhysRegsUsed[i] << "] "; - } - dbgs() << '\n'; - }); - - // Determine whether this is a copy instruction. The cases where the - // source or destination are phys regs are handled specially. 
- unsigned SrcCopyReg, DstCopyReg, SrcCopySubReg, DstCopySubReg; - unsigned SrcCopyPhysReg = 0U; - bool isCopy = TII->isMoveInstr(*MI, SrcCopyReg, DstCopyReg, - SrcCopySubReg, DstCopySubReg) && - SrcCopySubReg == DstCopySubReg; - if (isCopy && TargetRegisterInfo::isVirtualRegister(SrcCopyReg)) - SrcCopyPhysReg = getVirt2PhysRegMapSlot(SrcCopyReg); - - // Loop over the implicit uses, making sure that they are at the head of the - // use order list, so they don't get reallocated. - if (TID.ImplicitUses) { - for (const unsigned *ImplicitUses = TID.ImplicitUses; - *ImplicitUses; ++ImplicitUses) - MarkPhysRegRecentlyUsed(*ImplicitUses); - } - - SmallVector Kills; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isKill()) continue; - - if (!MO.isImplicit()) - Kills.push_back(MO.getReg()); - else if (!isReadModWriteImplicitKill(MI, MO.getReg())) - // These are extra physical register kills when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. - Kills.push_back(MO.getReg()); - } - - // If any physical regs are earlyclobber, spill any value they might - // have in them, then mark them unallocatable. - // If any virtual regs are earlyclobber, allocate them now (before - // freeing inputs that are killed). - if (MI->isInlineAsm()) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber() || - !MO.getReg()) - continue; - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned DestVirtReg = MO.getReg(); - unsigned DestPhysReg; - - // If DestVirtReg already has a value, use it. - if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) - DestPhysReg = getReg(MBB, MI, DestVirtReg); - MF->getRegInfo().setPhysRegUsed(DestPhysReg); - markVirtRegModified(DestVirtReg); - getVirtRegLastUse(DestVirtReg) = - std::make_pair((MachineInstr*)0, 0); - DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) - << " to %reg" << DestVirtReg << "\n"); - if (unsigned DestSubIdx = MO.getSubReg()) { - MO.setSubReg(0); - DestPhysReg = TRI->getSubReg(DestPhysReg, DestSubIdx); - } - MO.setReg(DestPhysReg); // Assign the earlyclobber register - } else { - unsigned Reg = MO.getReg(); - if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. - // These are extra physical register defs when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. - if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; - - MF->getRegInfo().setPhysRegUsed(Reg); - spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); - - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - MF->getRegInfo().setPhysRegUsed(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(*SubRegs); - } - } - } - } - - // If a DBG_VALUE says something is located in a spilled register, - // change the DBG_VALUE to be undef, which prevents the register - // from being reloaded here. Doing that would change the generated - // code, unless another use immediately follows this instruction. 
- if (MI->isDebugValue() && - MI->getNumOperands()==3 && MI->getOperand(0).isReg()) { - unsigned VirtReg = MI->getOperand(0).getReg(); - if (VirtReg && TargetRegisterInfo::isVirtualRegister(VirtReg) && - !getVirt2PhysRegMapSlot(VirtReg)) - MI->getOperand(0).setReg(0U); - } - - // Get the used operands into registers. This has the potential to spill - // incoming values if we are out of registers. Note that we completely - // ignore physical register uses here. We assume that if an explicit - // physical register is referenced by the instruction, that it is guaranteed - // to be live-in, or the input is badly hosed. - // - SmallSet ReloadedRegs; - for (unsigned i = 0; i != MI->getNumOperands(); ++i) { - MachineOperand &MO = MI->getOperand(i); - // here we are looking for only used operands (never def&use) - if (MO.isReg() && !MO.isDef() && MO.getReg() && !MO.isImplicit() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())) - MI = reloadVirtReg(MBB, MI, i, ReloadedRegs, - isCopy ? DstCopyReg : 0); - } - - // If this instruction is the last user of this register, kill the - // value, freeing the register being used, so it doesn't need to be - // spilled to memory. - // - for (unsigned i = 0, e = Kills.size(); i != e; ++i) { - unsigned VirtReg = Kills[i]; - unsigned PhysReg = VirtReg; - if (TargetRegisterInfo::isVirtualRegister(VirtReg)) { - // If the virtual register was never materialized into a register, it - // might not be in the map, but it won't hurt to zero it out anyway. - unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg); - PhysReg = PhysRegSlot; - PhysRegSlot = 0; - } else if (PhysRegsUsed[PhysReg] == -2) { - // Unallocatable register dead, ignore. - continue; - } else { - assert((!PhysRegsUsed[PhysReg] || PhysRegsUsed[PhysReg] == -1) && - "Silently clearing a virtual register?"); - } - - if (!PhysReg) continue; - - DEBUG(dbgs() << " Last use of " << TRI->getName(PhysReg) - << "[%reg" << VirtReg <<"], removing it from live set\n"); - removePhysReg(PhysReg); - for (const unsigned *SubRegs = TRI->getSubRegisters(PhysReg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] != -2) { - DEBUG(dbgs() << " Last use of " - << TRI->getName(*SubRegs) << "[%reg" << VirtReg - <<"], removing it from live set\n"); - removePhysReg(*SubRegs); - } - } - } - - // Loop over all of the operands of the instruction, spilling registers that - // are defined, and marking explicit destinations in the PhysRegsUsed map. - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || MO.isImplicit() || !MO.getReg() || - MO.isEarlyClobber() || - !TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - - unsigned Reg = MO.getReg(); - if (PhysRegsUsed[Reg] == -2) continue; // Something like ESP. - // These are extra physical register defs when a sub-register - // is defined (def of a sub-register is a read/mod/write of the - // larger registers). Ignore. 
- if (isReadModWriteImplicitDef(MI, MO.getReg())) continue; - - MF->getRegInfo().setPhysRegUsed(Reg); - spillPhysReg(MBB, MI, Reg, true); // Spill any existing value in reg - PhysRegsUsed[Reg] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(Reg); - - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - - MF->getRegInfo().setPhysRegUsed(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - AddToPhysRegsUseOrder(*SubRegs); - } - } - - // Loop over the implicit defs, spilling them as well. - if (TID.ImplicitDefs) { - for (const unsigned *ImplicitDefs = TID.ImplicitDefs; - *ImplicitDefs; ++ImplicitDefs) { - unsigned Reg = *ImplicitDefs; - if (PhysRegsUsed[Reg] != -2) { - spillPhysReg(MBB, MI, Reg, true); - AddToPhysRegsUseOrder(Reg); - PhysRegsUsed[Reg] = 0; // It is free and reserved now - } - MF->getRegInfo().setPhysRegUsed(Reg); - for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); - *SubRegs; ++SubRegs) { - if (PhysRegsUsed[*SubRegs] == -2) continue; - - AddToPhysRegsUseOrder(*SubRegs); - PhysRegsUsed[*SubRegs] = 0; // It is free and reserved now - MF->getRegInfo().setPhysRegUsed(*SubRegs); - } - } - } - - SmallVector DeadDefs; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.isDead()) - DeadDefs.push_back(MO.getReg()); - } - - // Okay, we have allocated all of the source operands and spilled any values - // that would be destroyed by defs of this instruction. Loop over the - // explicit defs and assign them to a register, spilling incoming values if - // we need to scavenge a register. - // - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isDef() || !MO.getReg() || - MO.isEarlyClobber() || - !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - - unsigned DestVirtReg = MO.getReg(); - unsigned DestPhysReg; - - // If DestVirtReg already has a value, use it. - if (!(DestPhysReg = getVirt2PhysRegMapSlot(DestVirtReg))) { - // If this is a copy try to reuse the input as the output; - // that will make the copy go away. - // If this is a copy, the source reg is a phys reg, and - // that reg is available, use that phys reg for DestPhysReg. - // If this is a copy, the source reg is a virtual reg, and - // the phys reg that was assigned to that virtual reg is now - // available, use that phys reg for DestPhysReg. (If it's now - // available that means this was the last use of the source.) 
- if (isCopy && - TargetRegisterInfo::isPhysicalRegister(SrcCopyReg) && - isPhysRegAvailable(SrcCopyReg)) { - DestPhysReg = SrcCopyReg; - assignVirtToPhysReg(DestVirtReg, DestPhysReg); - } else if (isCopy && - TargetRegisterInfo::isVirtualRegister(SrcCopyReg) && - SrcCopyPhysReg && isPhysRegAvailable(SrcCopyPhysReg) && - MF->getRegInfo().getRegClass(DestVirtReg)-> - contains(SrcCopyPhysReg)) { - DestPhysReg = SrcCopyPhysReg; - assignVirtToPhysReg(DestVirtReg, DestPhysReg); - } else - DestPhysReg = getReg(MBB, MI, DestVirtReg); - } - MF->getRegInfo().setPhysRegUsed(DestPhysReg); - markVirtRegModified(DestVirtReg); - getVirtRegLastUse(DestVirtReg) = std::make_pair((MachineInstr*)0, 0); - DEBUG(dbgs() << " Assigning " << TRI->getName(DestPhysReg) - << " to %reg" << DestVirtReg << "\n"); - - if (unsigned DestSubIdx = MO.getSubReg()) { - MO.setSubReg(0); - DestPhysReg = TRI->getSubReg(DestPhysReg, DestSubIdx); - } - MO.setReg(DestPhysReg); // Assign the output register - } - - // If this instruction defines any registers that are immediately dead, - // kill them now. - // - for (unsigned i = 0, e = DeadDefs.size(); i != e; ++i) { - unsigned VirtReg = DeadDefs[i]; - unsigned PhysReg = VirtReg; - if (TargetRegisterInfo::isVirtualRegister(VirtReg)) { - unsigned &PhysRegSlot = getVirt2PhysRegMapSlot(VirtReg); - PhysReg = PhysRegSlot; - assert(PhysReg != 0); - PhysRegSlot = 0; - } else if (PhysRegsUsed[PhysReg] == -2) { - // Unallocatable register dead, ignore. - continue; - } else if (!PhysReg) - continue; - - DEBUG(dbgs() << " Register " << TRI->getName(PhysReg) - << " [%reg" << VirtReg - << "] is never used, removing it from live set\n"); - removePhysReg(PhysReg); - for (const unsigned *AliasSet = TRI->getAliasSet(PhysReg); - *AliasSet; ++AliasSet) { - if (PhysRegsUsed[*AliasSet] != -2) { - DEBUG(dbgs() << " Register " << TRI->getName(*AliasSet) - << " [%reg" << *AliasSet - << "] is never used, removing it from live set\n"); - removePhysReg(*AliasSet); - } - } - } - - // If this instruction is a call, make sure there are no dirty registers. The - // call might throw an exception, and the landing pad expects to find all - // registers in stack slots. - if (TID.isCall()) - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) { - if (PhysRegsUsed[i] <= 0) continue; - unsigned VirtReg = PhysRegsUsed[i]; - if (!isVirtRegModified(VirtReg)) continue; - DEBUG(dbgs() << " Storing dirty %reg" << VirtReg); - storeVirtReg(MBB, MI, VirtReg, i, false); - markVirtRegModified(VirtReg, false); - DEBUG(dbgs() << " because the call might throw\n"); - } - - // Finally, if this is a noop copy instruction, zap it. (Except that if - // the copy is dead, it must be kept to avoid messing up liveness info for - // the register scavenger. See pr4100.) - if (TII->isMoveInstr(*MI, SrcCopyReg, DstCopyReg, - SrcCopySubReg, DstCopySubReg) && - SrcCopyReg == DstCopyReg && SrcCopySubReg == DstCopySubReg && - DeadDefs.empty()) { - ++NumCopies; - MBB.erase(MI); - } - } - - MachineBasicBlock::iterator MI = MBB.getFirstTerminator(); - - // Spill all physical registers holding virtual registers now. - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) - if (PhysRegsUsed[i] != -1 && PhysRegsUsed[i] != -2) { - if (unsigned VirtReg = PhysRegsUsed[i]) - spillVirtReg(MBB, MI, VirtReg, i); - else - removePhysReg(i); - } - -#if 0 - // This checking code is very expensive. 
- bool AllOk = true; - for (unsigned i = TargetRegisterInfo::FirstVirtualRegister, - e = MF->getRegInfo().getLastVirtReg(); i <= e; ++i) - if (unsigned PR = Virt2PhysRegMap[i]) { - cerr << "Register still mapped: " << i << " -> " << PR << "\n"; - AllOk = false; - } - assert(AllOk && "Virtual registers still in phys regs?"); -#endif - - // Clear any physical register which appear live at the end of the basic - // block, but which do not hold any virtual registers. e.g., the stack - // pointer. - PhysRegsUseOrder.clear(); -} - -/// runOnMachineFunction - Register allocate the whole function -/// -bool RALocal::runOnMachineFunction(MachineFunction &Fn) { - DEBUG(dbgs() << "Machine Function\n"); - MF = &Fn; - MRI = &Fn.getRegInfo(); - TM = &Fn.getTarget(); - TRI = TM->getRegisterInfo(); - TII = TM->getInstrInfo(); - - PhysRegsUsed.assign(TRI->getNumRegs(), -1); - - // At various places we want to efficiently check to see whether a register - // is allocatable. To handle this, we mark all unallocatable registers as - // being pinned down, permanently. - { - BitVector Allocable = TRI->getAllocatableSet(Fn); - for (unsigned i = 0, e = Allocable.size(); i != e; ++i) - if (!Allocable[i]) - PhysRegsUsed[i] = -2; // Mark the reg unallocable. - } - - // initialize the virtual->physical register map to have a 'null' - // mapping for all virtual registers - unsigned LastVirtReg = MF->getRegInfo().getLastVirtReg(); - StackSlotForVirtReg.grow(LastVirtReg); - Virt2PhysRegMap.grow(LastVirtReg); - Virt2LastUseMap.grow(LastVirtReg); - VirtRegModified.resize(LastVirtReg+1 - - TargetRegisterInfo::FirstVirtualRegister); - UsedInMultipleBlocks.resize(LastVirtReg+1 - - TargetRegisterInfo::FirstVirtualRegister); - - // Loop over all of the basic blocks, eliminating virtual register references - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) - AllocateBasicBlock(*MBB); - - StackSlotForVirtReg.clear(); - PhysRegsUsed.clear(); - VirtRegModified.clear(); - UsedInMultipleBlocks.clear(); - Virt2PhysRegMap.clear(); - Virt2LastUseMap.clear(); - return true; -} - -FunctionPass *llvm::createLocalRegisterAllocator() { - return new RALocal(); -} diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 4fafd2818a12..7e61a12a7eea 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -396,28 +396,23 @@ PBQPRegAlloc::CoalesceMap PBQPRegAlloc::findCoalesces() { if (srcRegIsPhysical && dstRegIsPhysical) continue; - // If it's a copy that includes a virtual register but the source and - // destination classes differ then we can't coalesce, so continue with - // the next instruction. - const TargetRegisterClass *srcRegClass = srcRegIsPhysical ? - tri->getPhysicalRegisterRegClass(srcReg) : mri->getRegClass(srcReg); - - const TargetRegisterClass *dstRegClass = dstRegIsPhysical ? - tri->getPhysicalRegisterRegClass(dstReg) : mri->getRegClass(dstReg); - - if (srcRegClass != dstRegClass) + // If it's a copy that includes two virtual register but the source and + // destination classes differ then we can't coalesce. + if (!srcRegIsPhysical && !dstRegIsPhysical && + mri->getRegClass(srcReg) != mri->getRegClass(dstReg)) continue; - // We also need any physical regs to be allocable, coalescing with - // a non-allocable register is invalid. - if (srcRegIsPhysical) { + // If one is physical and one is virtual, check that the physical is + // allocatable in the class of the virtual. 
+ if (srcRegIsPhysical && !dstRegIsPhysical) { + const TargetRegisterClass *dstRegClass = mri->getRegClass(dstReg); if (std::find(dstRegClass->allocation_order_begin(*mf), dstRegClass->allocation_order_end(*mf), srcReg) == dstRegClass->allocation_order_end(*mf)) continue; } - - if (dstRegIsPhysical) { + if (!srcRegIsPhysical && dstRegIsPhysical) { + const TargetRegisterClass *srcRegClass = mri->getRegClass(srcReg); if (std::find(srcRegClass->allocation_order_begin(*mf), srcRegClass->allocation_order_end(*mf), dstReg) == srcRegClass->allocation_order_end(*mf)) diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 1131e3db4e7d..ab0bc2d78a60 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -16,6 +16,8 @@ #include "llvm/CodeGen/RegisterCoalescer.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Pass.h" @@ -33,6 +35,160 @@ char RegisterCoalescer::ID = 0; // RegisterCoalescer::~RegisterCoalescer() {} +unsigned CoalescerPair::compose(unsigned a, unsigned b) const { + if (!a) return b; + if (!b) return a; + return tri_.composeSubRegIndices(a, b); +} + +bool CoalescerPair::isMoveInstr(const MachineInstr *MI, + unsigned &Src, unsigned &Dst, + unsigned &SrcSub, unsigned &DstSub) const { + if (MI->isCopy()) { + Dst = MI->getOperand(0).getReg(); + DstSub = MI->getOperand(0).getSubReg(); + Src = MI->getOperand(1).getReg(); + SrcSub = MI->getOperand(1).getSubReg(); + } else if (MI->isSubregToReg()) { + Dst = MI->getOperand(0).getReg(); + DstSub = compose(MI->getOperand(0).getSubReg(), MI->getOperand(3).getImm()); + Src = MI->getOperand(2).getReg(); + SrcSub = MI->getOperand(2).getSubReg(); + } else if (!tii_.isMoveInstr(*MI, Src, Dst, SrcSub, DstSub)) { + return false; + } + return true; +} + +bool CoalescerPair::setRegisters(const MachineInstr *MI) { + srcReg_ = dstReg_ = subIdx_ = 0; + newRC_ = 0; + flipped_ = crossClass_ = false; + + unsigned Src, Dst, SrcSub, DstSub; + if (!isMoveInstr(MI, Src, Dst, SrcSub, DstSub)) + return false; + partial_ = SrcSub || DstSub; + + // If one register is a physreg, it must be Dst. + if (TargetRegisterInfo::isPhysicalRegister(Src)) { + if (TargetRegisterInfo::isPhysicalRegister(Dst)) + return false; + std::swap(Src, Dst); + std::swap(SrcSub, DstSub); + flipped_ = true; + } + + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + if (TargetRegisterInfo::isPhysicalRegister(Dst)) { + // Eliminate DstSub on a physreg. + if (DstSub) { + Dst = tri_.getSubReg(Dst, DstSub); + if (!Dst) return false; + DstSub = 0; + } + + // Eliminate SrcSub by picking a corresponding Dst superregister. + if (SrcSub) { + Dst = tri_.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src)); + if (!Dst) return false; + SrcSub = 0; + } else if (!MRI.getRegClass(Src)->contains(Dst)) { + return false; + } + } else { + // Both registers are virtual. + + // Both registers have subreg indices. + if (SrcSub && DstSub) { + // For now we only handle the case of identical indices in commensurate + // registers: Dreg:ssub_1 + Dreg:ssub_1 -> Dreg + // FIXME: Handle Qreg:ssub_3 + Dreg:ssub_1 as QReg:dsub_1 + Dreg. 
+ if (SrcSub != DstSub) + return false; + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + if (!getCommonSubClass(DstRC, SrcRC)) + return false; + SrcSub = DstSub = 0; + } + + // There can be no SrcSub. + if (SrcSub) { + std::swap(Src, Dst); + DstSub = SrcSub; + SrcSub = 0; + assert(!flipped_ && "Unexpected flip"); + flipped_ = true; + } + + // Find the new register class. + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + if (DstSub) + newRC_ = tri_.getMatchingSuperRegClass(DstRC, SrcRC, DstSub); + else + newRC_ = getCommonSubClass(DstRC, SrcRC); + if (!newRC_) + return false; + crossClass_ = newRC_ != DstRC || newRC_ != SrcRC; + } + // Check our invariants + assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual"); + assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) && + "Cannot have a physical SubIdx"); + srcReg_ = Src; + dstReg_ = Dst; + subIdx_ = DstSub; + return true; +} + +bool CoalescerPair::flip() { + if (subIdx_ || TargetRegisterInfo::isPhysicalRegister(dstReg_)) + return false; + std::swap(srcReg_, dstReg_); + flipped_ = !flipped_; + return true; +} + +bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { + if (!MI) + return false; + unsigned Src, Dst, SrcSub, DstSub; + if (!isMoveInstr(MI, Src, Dst, SrcSub, DstSub)) + return false; + + // Find the virtual register that is srcReg_. + if (Dst == srcReg_) { + std::swap(Src, Dst); + std::swap(SrcSub, DstSub); + } else if (Src != srcReg_) { + return false; + } + + // Now check that Dst matches dstReg_. + if (TargetRegisterInfo::isPhysicalRegister(dstReg_)) { + if (!TargetRegisterInfo::isPhysicalRegister(Dst)) + return false; + assert(!subIdx_ && "Inconsistent CoalescerPair state."); + // DstSub could be set for a physreg from INSERT_SUBREG. + if (DstSub) + Dst = tri_.getSubReg(Dst, DstSub); + // Full copy of Src. + if (!SrcSub) + return dstReg_ == Dst; + // This is a partial register copy. Check that the parts match. + return tri_.getSubReg(dstReg_, SrcSub) == Dst; + } else { + // dstReg_ is virtual. + if (dstReg_ != Dst) + return false; + // Registers match, do the subregisters line up? + return compose(subIdx_, SrcSub) == DstSub; + } +} + // Because of the way .a files work, we must force the SimpleRC // implementation to be pulled in if the RegisterCoalescer classes are // pulled in. Otherwise we run the risk of RegisterCoalescer being diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 690e59f96ebc..43b3fb642635 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -141,6 +141,10 @@ void RegScavenger::forward() { // Find out which registers are early clobbered, killed, defined, and marked // def-dead in this instruction. + // FIXME: The scavenger is not predication aware. If the instruction is + // predicated, conservatively assume "kill" markers do not actually kill the + // register. Similarly ignores "dead" markers. + bool isPred = TII->isPredicated(MI); BitVector EarlyClobberRegs(NumPhysRegs); BitVector KillRegs(NumPhysRegs); BitVector DefRegs(NumPhysRegs); @@ -155,11 +159,11 @@ void RegScavenger::forward() { if (MO.isUse()) { // Two-address operands implicitly kill. 
- if (MO.isKill() || MI->isRegTiedToDefOperand(i)) + if (!isPred && (MO.isKill() || MI->isRegTiedToDefOperand(i))) addRegWithSubRegs(KillRegs, Reg); } else { assert(MO.isDef()); - if (MO.isDead()) + if (!isPred && MO.isDead()) addRegWithSubRegs(DeadRegs, Reg); else addRegWithSubRegs(DefRegs, Reg); @@ -238,8 +242,18 @@ unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const { return 0; } +/// getRegsAvailable - Return all available registers in the register class +/// in Mask. +void RegScavenger::getRegsAvailable(const TargetRegisterClass *RC, + BitVector &Mask) { + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) + if (!isAliasUsed(*I)) + Mask.set(*I); +} + /// findSurvivorReg - Return the candidate register that is unused for the -/// longest after MBBI. UseMI is set to the instruction where the search +/// longest after StargMII. UseMI is set to the instruction where the search /// stopped. /// /// No more than InstrLimit instructions are inspected. @@ -258,6 +272,10 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, bool inVirtLiveRange = false; for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) { + if (MI->isDebugValue()) { + ++InstrLimit; // Don't count debug instructions + continue; + } bool isVirtKillInsn = false; bool isVirtDefInsn = false; // Remove any candidates touched by instruction. @@ -321,13 +339,16 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, Candidates.reset(MO.getReg()); } + // Try to find a register that's unused if there is one, as then we won't + // have to spill. + if ((Candidates & RegsAvailable).any()) + Candidates &= RegsAvailable; + // Find the register whose use is furthest away. MachineBasicBlock::iterator UseMI; unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI); - // If we found an unused register there is no reason to spill it. We have - // probably found a callee-saved register that has been saved in the - // prologue, but happens to be unused at this point. + // If we found an unused register there is no reason to spill it. if (!isAliasUsed(SReg)) return SReg; diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index da20c12c360a..7d39dc496afe 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -380,26 +380,26 @@ void ScheduleDAG::VerifySchedule(bool isBottomUp) { } #endif -/// InitDAGTopologicalSorting - create the initial topological +/// InitDAGTopologicalSorting - create the initial topological /// ordering from the DAG to be scheduled. /// -/// The idea of the algorithm is taken from +/// The idea of the algorithm is taken from /// "Online algorithms for managing the topological order of /// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly -/// This is the MNR algorithm, which was first introduced by -/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in +/// This is the MNR algorithm, which was first introduced by +/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in /// "Maintaining a topological order under edge insertions". /// -/// Short description of the algorithm: +/// Short description of the algorithm: /// /// Topological ordering, ord, of a DAG maps each node to a topological /// index so that for all edges X->Y it is the case that ord(X) < ord(Y). /// -/// This means that if there is a path from the node X to the node Z, +/// This means that if there is a path from the node X to the node Z, /// then ord(X) < ord(Z). 
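As a rough standalone sketch of the ordering property described in the ScheduleDAG.cpp comments above (not LLVM code; the Dag type, the ord array, and the function names here are invented for illustration): once every edge u -> v satisfies ord[u] < ord[v], a query "is b reachable from a" can give up immediately when ord[b] < ord[a], and the search never needs to visit a node whose index exceeds ord[b].

#include <vector>

// Minimal sketch, assuming a DAG with a maintained topological index.
struct Dag {
  std::vector<std::vector<int>> succ; // succ[u] = successors of u
  std::vector<int> ord;               // topological index of each node
};

// Is node b reachable from node a?
static bool isReachable(const Dag &G, int a, int b) {
  if (G.ord[b] < G.ord[a])
    return false;                     // b is ordered before a: no path can exist
  std::vector<char> visited(G.succ.size(), 0);
  std::vector<int> work{a};
  while (!work.empty()) {
    int u = work.back();
    work.pop_back();
    if (u == b)
      return true;
    if (visited[u])
      continue;
    visited[u] = 1;
    for (int s : G.succ[u])
      if (G.ord[s] <= G.ord[b])       // prune nodes outside the affected region
        work.push_back(s);
  }
  return false;
}

// A new edge from -> to closes a cycle iff 'from' is already reachable from 'to'.
static bool wouldCreateCycle(const Dag &G, int from, int to) {
  return isReachable(G, to, from);
}

With those helpers, inserting an edge from -> to keeps the graph acyclic exactly when wouldCreateCycle(G, from, to) is false, which is roughly the check a scheduler would want to make before adding a new dependence edge.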
/// /// This property can be used to check for reachability of nodes: -/// if Z is reachable from X, then an insertion of the edge Z->X would +/// if Z is reachable from X, then an insertion of the edge Z->X would /// create a cycle. /// /// The algorithm first computes a topological ordering for the DAG by @@ -431,7 +431,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { // Collect leaf nodes. WorkList.push_back(SU); } - } + } int Id = DAGSize; while (!WorkList.empty()) { @@ -456,7 +456,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() { SUnit *SU = &SUnits[i]; for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { - assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && + assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] && "Wrong topological sorting"); } } @@ -494,7 +494,7 @@ void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) { void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, bool& HasLoop) { std::vector WorkList; - WorkList.reserve(SUnits.size()); + WorkList.reserve(SUnits.size()); WorkList.push_back(SU); do { @@ -504,20 +504,20 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound, for (int I = SU->Succs.size()-1; I >= 0; --I) { int s = SU->Succs[I].getSUnit()->NodeNum; if (Node2Index[s] == UpperBound) { - HasLoop = true; + HasLoop = true; return; } // Visit successors if not already and in affected region. if (!Visited.test(s) && Node2Index[s] < UpperBound) { WorkList.push_back(SU->Succs[I].getSUnit()); - } - } + } + } } while (!WorkList.empty()); } -/// Shift - Renumber the nodes so that the topological ordering is +/// Shift - Renumber the nodes so that the topological ordering is /// preserved. -void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, +void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound, int UpperBound) { std::vector L; int shift = 0; @@ -568,7 +568,7 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, // Is Ord(TargetSU) < Ord(SU) ? if (LowerBound < UpperBound) { Visited.reset(); - // There may be a path from TargetSU to SU. Check for it. + // There may be a path from TargetSU to SU. Check for it. DFS(TargetSU, UpperBound, HasLoop); } return HasLoop; @@ -580,8 +580,7 @@ void ScheduleDAGTopologicalSort::Allocate(int n, int index) { Index2Node[index] = n; } -ScheduleDAGTopologicalSort::ScheduleDAGTopologicalSort( - std::vector &sunits) - : SUnits(sunits) {} +ScheduleDAGTopologicalSort:: +ScheduleDAGTopologicalSort(std::vector &sunits) : SUnits(sunits) {} ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {} diff --git a/lib/CodeGen/ScheduleDAGEmit.cpp b/lib/CodeGen/ScheduleDAGEmit.cpp index ee08e1dc0eea..0a2fb3796a42 100644 --- a/lib/CodeGen/ScheduleDAGEmit.cpp +++ b/lib/CodeGen/ScheduleDAGEmit.cpp @@ -50,11 +50,8 @@ void ScheduleDAG::EmitPhysRegCopy(SUnit *SU, break; } } - bool Success = TII->copyRegToReg(*BB, InsertPos, Reg, VRI->second, - SU->CopyDstRC, SU->CopySrcRC, - DebugLoc()); - (void)Success; - assert(Success && "copyRegToReg failed!"); + BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg) + .addReg(VRI->second); } else { // Copy from physical register. assert(I->getReg() && "Unknown physical register!"); @@ -62,11 +59,8 @@ void ScheduleDAG::EmitPhysRegCopy(SUnit *SU, bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second; isNew = isNew; // Silence compiler warning. 
assert(isNew && "Node emitted out of order - early"); - bool Success = TII->copyRegToReg(*BB, InsertPos, VRBase, I->getReg(), - SU->CopyDstRC, SU->CopySrcRC, - DebugLoc()); - (void)Success; - assert(Success && "copyRegToReg failed!"); + BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase) + .addReg(I->getReg()); } break; } diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h index ad82db28f88b..d90659bb163e 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.h +++ b/lib/CodeGen/ScheduleDAGInstrs.h @@ -69,8 +69,10 @@ namespace llvm { const SmallSet &LoopLiveIns) { unsigned Count = 0; for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I, ++Count) { + I != E; ++I) { const MachineInstr *MI = I; + if (MI->isDebugValue()) + continue; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isUse()) @@ -79,6 +81,7 @@ namespace llvm { if (LoopLiveIns.count(MOReg)) Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count))); } + ++Count; // Not every iteration due to dbg_value above. } const std::vector &Children = Node->getChildren(); diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index 0cfd5e1d7e21..799988a4c862 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMSelectionDAG - CallingConvLower.cpp DAGCombiner.cpp FastISel.cpp FunctionLoweringInfo.cpp diff --git a/lib/CodeGen/SelectionDAG/CallingConvLower.cpp b/lib/CodeGen/SelectionDAG/CallingConvLower.cpp deleted file mode 100644 index 4e6c1fcc9604..000000000000 --- a/lib/CodeGen/SelectionDAG/CallingConvLower.cpp +++ /dev/null @@ -1,179 +0,0 @@ -//===-- CallingConvLower.cpp - Calling Conventions ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the CCState class, used for lowering and implementing -// calling conventions. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -CCState::CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &tm, - SmallVector &locs, LLVMContext &C) - : CallingConv(CC), IsVarArg(isVarArg), TM(tm), - TRI(*TM.getRegisterInfo()), Locs(locs), Context(C) { - // No stack is used. - StackOffset = 0; - - UsedRegs.resize((TRI.getNumRegs()+31)/32); -} - -// HandleByVal - Allocate a stack slot large enough to pass an argument by -// value. The size and alignment information of the argument is encoded in its -// parameter attribute. 
-void CCState::HandleByVal(unsigned ValNo, EVT ValVT, - EVT LocVT, CCValAssign::LocInfo LocInfo, - int MinSize, int MinAlign, - ISD::ArgFlagsTy ArgFlags) { - unsigned Align = ArgFlags.getByValAlign(); - unsigned Size = ArgFlags.getByValSize(); - if (MinSize > (int)Size) - Size = MinSize; - if (MinAlign > (int)Align) - Align = MinAlign; - unsigned Offset = AllocateStack(Size, Align); - - addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); -} - -/// MarkAllocated - Mark a register and all of its aliases as allocated. -void CCState::MarkAllocated(unsigned Reg) { - UsedRegs[Reg/32] |= 1 << (Reg&31); - - if (const unsigned *RegAliases = TRI.getAliasSet(Reg)) - for (; (Reg = *RegAliases); ++RegAliases) - UsedRegs[Reg/32] |= 1 << (Reg&31); -} - -/// AnalyzeFormalArguments - Analyze an array of argument values, -/// incorporating info about the formals into this state. -void -CCState::AnalyzeFormalArguments(const SmallVectorImpl &Ins, - CCAssignFn Fn) { - unsigned NumArgs = Ins.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - EVT ArgVT = Ins[i].VT; - ISD::ArgFlagsTy ArgFlags = Ins[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { -#ifndef NDEBUG - dbgs() << "Formal argument #" << i << " has unhandled type " - << ArgVT.getEVTString(); -#endif - llvm_unreachable(0); - } - } -} - -/// CheckReturn - Analyze the return values of a function, returning true if -/// the return can be performed without sret-demotion, and false otherwise. -bool CCState::CheckReturn(const SmallVectorImpl &OutTys, - const SmallVectorImpl &ArgsFlags, - CCAssignFn Fn) { - // Determine which register each value should be copied into. - for (unsigned i = 0, e = OutTys.size(); i != e; ++i) { - EVT VT = OutTys[i]; - ISD::ArgFlagsTy ArgFlags = ArgsFlags[i]; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) - return false; - } - return true; -} - -/// AnalyzeReturn - Analyze the returned values of a return, -/// incorporating info about the result values into this state. -void CCState::AnalyzeReturn(const SmallVectorImpl &Outs, - CCAssignFn Fn) { - // Determine which register each value should be copied into. - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - EVT VT = Outs[i].Val.getValueType(); - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) { -#ifndef NDEBUG - dbgs() << "Return operand #" << i << " has unhandled type " - << VT.getEVTString(); -#endif - llvm_unreachable(0); - } - } -} - - -/// AnalyzeCallOperands - Analyze the outgoing arguments to a call, -/// incorporating info about the passed values into this state. -void CCState::AnalyzeCallOperands(const SmallVectorImpl &Outs, - CCAssignFn Fn) { - unsigned NumOps = Outs.size(); - for (unsigned i = 0; i != NumOps; ++i) { - EVT ArgVT = Outs[i].Val.getValueType(); - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { -#ifndef NDEBUG - dbgs() << "Call operand #" << i << " has unhandled type " - << ArgVT.getEVTString(); -#endif - llvm_unreachable(0); - } - } -} - -/// AnalyzeCallOperands - Same as above except it takes vectors of types -/// and argument flags. 
-void CCState::AnalyzeCallOperands(SmallVectorImpl &ArgVTs, - SmallVectorImpl &Flags, - CCAssignFn Fn) { - unsigned NumOps = ArgVTs.size(); - for (unsigned i = 0; i != NumOps; ++i) { - EVT ArgVT = ArgVTs[i]; - ISD::ArgFlagsTy ArgFlags = Flags[i]; - if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { -#ifndef NDEBUG - dbgs() << "Call operand #" << i << " has unhandled type " - << ArgVT.getEVTString(); -#endif - llvm_unreachable(0); - } - } -} - -/// AnalyzeCallResult - Analyze the return values of a call, -/// incorporating info about the passed values into this state. -void CCState::AnalyzeCallResult(const SmallVectorImpl &Ins, - CCAssignFn Fn) { - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - EVT VT = Ins[i].VT; - ISD::ArgFlagsTy Flags = Ins[i].Flags; - if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) { -#ifndef NDEBUG - dbgs() << "Call result #" << i << " has unhandled type " - << VT.getEVTString(); -#endif - llvm_unreachable(0); - } - } -} - -/// AnalyzeCallResult - Same as above except it's specialized for calls which -/// produce a single value. -void CCState::AnalyzeCallResult(EVT VT, CCAssignFn Fn) { - if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) { -#ifndef NDEBUG - dbgs() << "Call result has unhandled type " - << VT.getEVTString(); -#endif - llvm_unreachable(0); - } -} diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6bddd784fe86..e67175246457 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -211,6 +211,7 @@ namespace { SDValue visitBUILD_VECTOR(SDNode *N); SDValue visitCONCAT_VECTORS(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); + SDValue visitMEMBARRIER(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, DebugLoc DL, SDValue LHS, SDValue RHS); @@ -668,7 +669,7 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); Replace = true; - return DAG.getExtLoad(ExtType, dl, PVT, + return DAG.getExtLoad(ExtType, PVT, dl, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), LD->getSrcValueOffset(), MemVT, LD->isVolatile(), @@ -890,7 +891,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) { ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? 
ISD::ZEXTLOAD : ISD::EXTLOAD) : LD->getExtensionType(); - SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT, + SDValue NewLD = DAG.getExtLoad(ExtType, PVT, dl, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), LD->getSrcValueOffset(), MemVT, LD->isVolatile(), @@ -1079,6 +1080,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); + case ISD::MEMBARRIER: return visitMEMBARRIER(N); } return SDValue(); } @@ -1313,7 +1315,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (GlobalAddressSDNode *GA = dyn_cast(N0)) if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C && GA->getOpcode() == ISD::GlobalAddress) - return DAG.getGlobalAddress(GA->getGlobal(), VT, + return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT, GA->getOffset() + (uint64_t)N1C->getSExtValue()); // fold ((c1-A)+c2) -> (c1+c2)-A @@ -1550,7 +1552,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { // fold (sub Sym, c) -> Sym-c if (N1C && GA->getOpcode() == ISD::GlobalAddress) - return DAG.getGlobalAddress(GA->getGlobal(), VT, + return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT, GA->getOffset() - (uint64_t)N1C->getSExtValue()); // fold (sub Sym+c1, Sym+c2) -> c1-c2 @@ -2028,7 +2030,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) - // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) + // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) // // do not sink logical op inside of a vector extend, since it may combine // into a vsetcc. @@ -2038,7 +2040,10 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // Avoid infinite looping with PromoteIntBinOp. (N0.getOpcode() == ISD::ANY_EXTEND && (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || - (N0.getOpcode() == ISD::TRUNCATE && TLI.isTypeLegal(Op0VT))) && + (N0.getOpcode() == ISD::TRUNCATE && + (!TLI.isZExtFree(VT, Op0VT) || + !TLI.isTruncateFree(Op0VT, VT)) && + TLI.isTypeLegal(Op0VT))) && !VT.isVector() && Op0VT == N1.getOperand(0).getValueType() && (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) { @@ -2193,7 +2198,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N0.getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -2216,7 +2221,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { BitWidth - MemVT.getScalarType().getSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N0.getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -2250,7 +2255,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { EVT LoadResultTy = HasAnyExt ? 
LN0->getValueType(0) : VT; SDValue NewLoad = - DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy, + DAG.getExtLoad(ISD::ZEXTLOAD, LoadResultTy, LN0->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), @@ -2286,7 +2291,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT; SDValue Load = - DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy, + DAG.getExtLoad(ISD::ZEXTLOAD, LoadResultTy, LN0->getDebugLoc(), LN0->getChain(), NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset(), ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), @@ -2317,7 +2322,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } // fold (or x, undef) -> -1 - if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) { + if (!LegalOperations && + (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)) { EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT; return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT); } @@ -2425,6 +2431,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDNode *Rot = MatchRotate(N0, N1, N->getDebugLoc())) return SDValue(Rot, 0); + // Simplify the operands using demanded-bits information. + if (!VT.isVector() && + SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -3158,6 +3169,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return NewSRL; } + // Attempt to convert a srl of a load into a narrower zero-extending load. + SDValue NarrowLoad = ReduceLoadWidth(N); + if (NarrowLoad.getNode()) + return NarrowLoad; + // Here is a common situation. We want to optimize: // // %a = ... @@ -3487,7 +3503,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -3531,7 +3547,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -3557,24 +3573,24 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. 
if (VT.getSizeInBits() == N0VT.getSizeInBits()) - return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0), - N0.getOperand(1), - cast(N0.getOperand(2))->get()); + return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0), + N0.getOperand(1), + cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend else { - EVT MatchingElementType = - EVT::getIntegerVT(*DAG.getContext(), - N0VT.getScalarType().getSizeInBits()); - EVT MatchingVectorType = - EVT::getVectorVT(*DAG.getContext(), MatchingElementType, - N0VT.getVectorNumElements()); - SDValue VsetCC = - DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); + EVT MatchingElementType = + EVT::getIntegerVT(*DAG.getContext(), + N0VT.getScalarType().getSizeInBits()); + EVT MatchingVectorType = + EVT::getVectorVT(*DAG.getContext(), MatchingElementType, + N0VT.getVectorNumElements()); + SDValue VsetCC = + DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), + N0.getOperand(1), + cast(N0.getOperand(2))->get()); + return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); } } @@ -3635,10 +3651,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (truncate x)) -> (and x, mask) if (N0.getOpcode() == ISD::TRUNCATE && - (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) && - (!TLI.isTruncateFree(N0.getOperand(0).getValueType(), - N0.getValueType()) || - !TLI.isZExtFree(N0.getValueType(), VT))) { + (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) { SDValue Op = N0.getOperand(0); if (Op.getValueType().bitsLT(VT)) { Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op); @@ -3679,7 +3692,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -3723,7 +3736,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { EVT MemVT = LN0->getMemoryVT(); if ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) { - SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, @@ -3881,7 +3894,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -3925,8 +3938,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); - SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), N->getDebugLoc(), - VT, LN0->getChain(), LN0->getBasePtr(), + SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), VT, + N->getDebugLoc(), + LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), MemVT, 
LN0->isVolatile(), LN0->isNonTemporal(), @@ -3950,24 +3964,24 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == N0VT.getSizeInBits()) - return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0), - N0.getOperand(1), - cast(N0.getOperand(2))->get()); + return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0), + N0.getOperand(1), + cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend else { - EVT MatchingElementType = - EVT::getIntegerVT(*DAG.getContext(), - N0VT.getScalarType().getSizeInBits()); - EVT MatchingVectorType = - EVT::getVectorVT(*DAG.getContext(), MatchingElementType, - N0VT.getVectorNumElements()); - SDValue VsetCC = - DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); + EVT MatchingElementType = + EVT::getIntegerVT(*DAG.getContext(), + N0VT.getScalarType().getSizeInBits()); + EVT MatchingVectorType = + EVT::getVectorVT(*DAG.getContext(), MatchingElementType, + N0VT.getVectorNumElements()); + SDValue VsetCC = + DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), + N0.getOperand(1), + cast(N0.getOperand(2))->get()); + return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); } } @@ -4024,6 +4038,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { /// extended, also fold the extension to form a extending load. SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { unsigned Opc = N->getOpcode(); + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -4040,6 +4055,15 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ExtVT = cast(N->getOperand(1))->getVT(); if (LegalOperations && !TLI.isLoadExtLegal(ISD::SEXTLOAD, ExtVT)) return SDValue(); + } else if (Opc == ISD::SRL) { + // Annother special-case: SRL is basically zero-extending a narrower + // value. + ExtType = ISD::ZEXTLOAD; + N0 = SDValue(N, 0); + ConstantSDNode *N01 = dyn_cast(N0.getOperand(1)); + if (!N01) return SDValue(); + ExtVT = EVT::getIntegerVT(*DAG.getContext(), + VT.getSizeInBits() - N01->getZExtValue()); } unsigned EVTBits = ExtVT.getSizeInBits(); @@ -4085,7 +4109,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { ? 
DAG.getLoad(VT, N0.getDebugLoc(), LN0->getChain(), NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff, LN0->isVolatile(), LN0->isNonTemporal(), NewAlign) - : DAG.getExtLoad(ExtType, N0.getDebugLoc(), VT, LN0->getChain(), NewPtr, + : DAG.getExtLoad(ExtType, VT, N0.getDebugLoc(), LN0->getChain(), NewPtr, LN0->getSrcValue(), LN0->getSrcValueOffset() + PtrOff, ExtVT, LN0->isVolatile(), LN0->isNonTemporal(), NewAlign); @@ -4172,7 +4196,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), EVT, @@ -4189,7 +4213,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) { LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), EVT, @@ -4243,8 +4267,17 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) - if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) - return ReduceLoadWidth(N); + if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { + SDValue Reduced = ReduceLoadWidth(N); + if (Reduced.getNode()) + return Reduced; + } + + // Simplify the operands using demanded-bits information. + if (!VT.isVector() && + SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } @@ -4943,7 +4976,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) { LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT, + SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, VT, N->getDebugLoc(), LN0->getChain(), LN0->getBasePtr(), LN0->getSrcValue(), LN0->getSrcValueOffset(), @@ -5527,8 +5560,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > LD->getAlignment()) - return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(), - LD->getValueType(0), + return DAG.getExtLoad(LD->getExtensionType(), LD->getValueType(0), + N->getDebugLoc(), Chain, Ptr, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), Align); @@ -5551,8 +5584,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); } else { - ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), - LD->getValueType(0), + ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getValueType(0), + LD->getDebugLoc(), BetterChain, Ptr, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), @@ -6077,7 +6110,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. 
- EVT EltVT = InVec.getValueType().getVectorElementType(); SDValue InOp = InVec.getOperand(0); EVT NVT = N->getValueType(0); if (InOp.getValueType() != NVT) { @@ -6277,8 +6309,6 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { - return SDValue(); - EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -6334,6 +6364,59 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitMEMBARRIER(SDNode* N) { + if (!TLI.getShouldFoldAtomicFences()) + return SDValue(); + + SDValue atomic = N->getOperand(0); + switch (atomic.getOpcode()) { + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + break; + default: + return SDValue(); + } + + SDValue fence = atomic.getOperand(0); + if (fence.getOpcode() != ISD::MEMBARRIER) + return SDValue(); + + switch (atomic.getOpcode()) { + case ISD::ATOMIC_CMP_SWAP: + return SDValue(DAG.UpdateNodeOperands(atomic.getNode(), + fence.getOperand(0), + atomic.getOperand(1), atomic.getOperand(2), + atomic.getOperand(3)), atomic.getResNo()); + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + return SDValue(DAG.UpdateNodeOperands(atomic.getNode(), + fence.getOperand(0), + atomic.getOperand(1), atomic.getOperand(2)), + atomic.getResNo()); + default: + return SDValue(); + } +} + /// XformToShuffleWithZero - Returns a vector_shuffle if it able to transform /// an AND to a vector_shuffle with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> @@ -6565,8 +6648,8 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, LLD->getAlignment()); } else { Load = DAG.getExtLoad(LLD->getExtensionType(), - TheSelect->getDebugLoc(), TheSelect->getValueType(0), + TheSelect->getDebugLoc(), LLD->getChain(), Addr, 0, 0, LLD->getMemoryVT(), LLD->isVolatile(), @@ -6807,38 +6890,34 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1, } } - // Check to see if this is an integer abs. select_cc setl[te] X, 0, -X, X -> + // Check to see if this is an integer abs. 
+ // select_cc setg[te] X, 0, X, -X -> + // select_cc setgt X, -1, X, -X -> + // select_cc setl[te] X, 0, -X, X -> + // select_cc setlt X, 1, -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) - if (N1C && N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE) && - N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1) && - N2.getOperand(0) == N1 && N0.getValueType().isInteger()) { + if (N1C) { + ConstantSDNode *SubC = NULL; + if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || + (N1C->isAllOnesValue() && CC == ISD::SETGT)) && + N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) + SubC = dyn_cast(N3.getOperand(0)); + else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) || + (N1C->isOne() && CC == ISD::SETLT)) && + N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) + SubC = dyn_cast(N2.getOperand(0)); + EVT XType = N0.getValueType(); - SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, N0, - DAG.getConstant(XType.getSizeInBits()-1, - getShiftAmountTy())); - SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), XType, - N0, Shift); - AddToWorkList(Shift.getNode()); - AddToWorkList(Add.getNode()); - return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); - } - // Check to see if this is an integer abs. select_cc setgt X, -1, X, -X -> - // Y = sra (X, size(X)-1); xor (add (X, Y), Y) - if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT && - N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) { - if (ConstantSDNode *SubC = dyn_cast(N3.getOperand(0))) { - EVT XType = N0.getValueType(); - if (SubC->isNullValue() && XType.isInteger()) { - SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, - N0, - DAG.getConstant(XType.getSizeInBits()-1, - getShiftAmountTy())); - SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), - XType, N0, Shift); - AddToWorkList(Shift.getNode()); - AddToWorkList(Add.getNode()); - return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); - } + if (SubC && SubC->isNullValue() && XType.isInteger()) { + SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType, + N0, + DAG.getConstant(XType.getSizeInBits()-1, + getShiftAmountTy())); + SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(), + XType, N0, Shift); + AddToWorkList(Shift.getNode()); + AddToWorkList(Add.getNode()); + return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); } } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 95f4d072e01f..3f7e4a5fac42 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -44,18 +44,38 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Analysis/DebugInfo.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/ErrorHandling.h" -#include "FunctionLoweringInfo.h" using namespace llvm; +/// startNewBlock - Set the current block to which generated machine +/// instructions will be appended, and clear the local CSE map. +/// +void FastISel::startNewBlock() { + LocalValueMap.clear(); + + // Start out as null, meaning no local-value instructions have + // been emitted.
+ LastLocalValue = 0; + + // Advance the last local value past any EH_LABEL instructions. + MachineBasicBlock::iterator + I = FuncInfo.MBB->begin(), E = FuncInfo.MBB->end(); + while (I != E && I->getOpcode() == TargetOpcode::EH_LABEL) { + LastLocalValue = I; + ++I; + } +} + bool FastISel::hasTrivialKill(const Value *V) const { // Don't consider constants or arguments to have trivial kills. const Instruction *I = dyn_cast(V); @@ -99,25 +119,31 @@ unsigned FastISel::getRegForValue(const Value *V) { // cache values defined by Instructions across blocks, and other values // only locally. This is because Instructions already have the SSA // def-dominates-use requirement enforced. - DenseMap::iterator I = ValueMap.find(V); - if (I != ValueMap.end()) - return I->second; + DenseMap::iterator I = FuncInfo.ValueMap.find(V); + if (I != FuncInfo.ValueMap.end()) { + unsigned Reg = I->second; + return Reg; + } unsigned Reg = LocalValueMap[V]; if (Reg != 0) return Reg; // In bottom-up mode, just create the virtual register which will be used // to hold the value. It will be materialized later. - if (IsBottomUp) { - Reg = createResultReg(TLI.getRegClassFor(VT)); - if (isa(V)) - ValueMap[V] = Reg; - else - LocalValueMap[V] = Reg; - return Reg; - } + if (isa(V) && + (!isa(V) || + !FuncInfo.StaticAllocaMap.count(cast(V)))) + return FuncInfo.InitializeRegForValue(V); + + MachineBasicBlock::iterator SaveInsertPt = enterLocalValueArea(); + + // Materialize the value in a register. Emit any instructions in the + // local value area. + Reg = materializeRegForValue(V, VT); - return materializeRegForValue(V, VT); + leaveLocalValueArea(SaveInsertPt); + + return Reg; } /// materializeRegForValue - Helper for getRegForValue. This function is @@ -161,11 +187,15 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { } } } else if (const Operator *Op = dyn_cast(V)) { - if (!SelectOperator(Op, Op->getOpcode())) return 0; - Reg = LocalValueMap[Op]; + if (!SelectOperator(Op, Op->getOpcode())) + if (!isa(Op) || + !TargetSelectInstruction(cast(Op))) + return 0; + Reg = lookUpRegForValue(Op); } else if (isa(V)) { Reg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(MBB, DL, TII.get(TargetOpcode::IMPLICIT_DEF), Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::IMPLICIT_DEF), Reg); } // If target-independent code couldn't handle the value, give target-specific @@ -175,8 +205,10 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { // Don't cache constant materializations in the general ValueMap. // To do so would require tracking what uses they dominate. - if (Reg != 0) + if (Reg != 0) { LocalValueMap[V] = Reg; + LastLocalValue = MRI.getVRegDef(Reg); + } return Reg; } @@ -185,8 +217,9 @@ unsigned FastISel::lookUpRegForValue(const Value *V) { // cache values defined by Instructions across blocks, and other values // only locally. This is because Instructions already have the SSA // def-dominates-use requirement enforced. - if (ValueMap.count(V)) - return ValueMap[V]; + DenseMap::iterator I = FuncInfo.ValueMap.find(V); + if (I != FuncInfo.ValueMap.end()) + return I->second; return LocalValueMap[V]; } @@ -202,14 +235,17 @@ unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) { return Reg; } - unsigned &AssignedReg = ValueMap[I]; + unsigned &AssignedReg = FuncInfo.ValueMap[I]; if (AssignedReg == 0) + // Use the new register.
AssignedReg = Reg; else if (Reg != AssignedReg) { - const TargetRegisterClass *RegClass = MRI.getRegClass(Reg); - TII.copyRegToReg(*MBB, MBB->end(), AssignedReg, - Reg, RegClass, RegClass, DL); + // Arrange for uses of AssignedReg to be replaced by uses of Reg. + FuncInfo.RegFixups[AssignedReg] = Reg; + + AssignedReg = Reg; } + return AssignedReg; } @@ -237,6 +273,33 @@ std::pair FastISel::getRegForGEPIndex(const Value *Idx) { return std::pair(IdxN, IdxNIsKill); } +void FastISel::recomputeInsertPt() { + if (getLastLocalValue()) { + FuncInfo.InsertPt = getLastLocalValue(); + ++FuncInfo.InsertPt; + } else + FuncInfo.InsertPt = FuncInfo.MBB->getFirstNonPHI(); + + // Now skip past any EH_LABELs, which must remain at the beginning. + while (FuncInfo.InsertPt != FuncInfo.MBB->end() && + FuncInfo.InsertPt->getOpcode() == TargetOpcode::EH_LABEL) + ++FuncInfo.InsertPt; +} + +MachineBasicBlock::iterator FastISel::enterLocalValueArea() { + MachineBasicBlock::iterator OldInsertPt = FuncInfo.InsertPt; + recomputeInsertPt(); + return OldInsertPt; +} + +void FastISel::leaveLocalValueArea(MachineBasicBlock::iterator OldInsertPt) { + if (FuncInfo.InsertPt != FuncInfo.MBB->begin()) + LastLocalValue = llvm::prior(FuncInfo.InsertPt); + + // Restore the previous insert position. + FuncInfo.InsertPt = OldInsertPt; +} + /// SelectBinaryOp - Select and emit code for a binary operator instruction, /// which has an opcode which directly corresponds to the given ISD opcode. /// @@ -345,7 +408,7 @@ bool FastISel::SelectGetElementPtr(const User *I) { // If this is a constant subscript, handle it quickly. if (const ConstantInt *CI = dyn_cast(Idx)) { - if (CI->getZExtValue() == 0) continue; + if (CI->isZero()) continue; uint64_t Offs = TD.getTypeAllocSize(Ty)*cast(CI)->getSExtValue(); N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, Offs, VT); @@ -395,7 +458,7 @@ bool FastISel::SelectCall(const User *I) { case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast(I); if (!DIVariable(DI->getVariable()).Verify() || - !MF.getMMI().hasDebugInfo()) + !FuncInfo.MF->getMMI().hasDebugInfo()) return true; const Value *Address = DI->getAddress(); @@ -409,11 +472,12 @@ bool FastISel::SelectCall(const User *I) { // those are handled in SelectionDAGBuilder. if (AI) { DenseMap::iterator SI = - StaticAllocaMap.find(AI); - if (SI == StaticAllocaMap.end()) break; // VLAs. + FuncInfo.StaticAllocaMap.find(AI); + if (SI == FuncInfo.StaticAllocaMap.end()) break; // VLAs. int FI = SI->second; if (!DI->getDebugLoc().isUnknown()) - MF.getMMI().setVariableDbgInfo(DI->getVariable(), FI, DI->getDebugLoc()); + FuncInfo.MF->getMMI().setVariableDbgInfo(DI->getVariable(), + FI, DI->getDebugLoc()); } else // Building the map above is target independent. Generating DBG_VALUE // inline is target dependent; do this now. @@ -428,23 +492,28 @@ bool FastISel::SelectCall(const User *I) { if (!V) { // Currently the optimizer can produce this; insert an undef to // help debugging. Probably the optimizer should not do this. - BuildMI(MBB, DL, II).addReg(0U).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(0U).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else if (const ConstantInt *CI = dyn_cast(V)) { - BuildMI(MBB, DL, II).addImm(CI->getZExtValue()).addImm(DI->getOffset()). 
- addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addImm(CI->getZExtValue()).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else if (const ConstantFP *CF = dyn_cast(V)) { - BuildMI(MBB, DL, II).addFPImm(CF).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addFPImm(CF).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else if (unsigned Reg = lookUpRegForValue(V)) { - BuildMI(MBB, DL, II).addReg(Reg, RegState::Debug).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Reg, RegState::Debug).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } else { // We can't yet handle anything else here because it would require // generating code, thus altering codegen because of debug info. // Insert an undef so we can see what we dropped. - BuildMI(MBB, DL, II).addReg(0U).addImm(DI->getOffset()). - addMetadata(DI->getVariable()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(0U).addImm(DI->getOffset()) + .addMetadata(DI->getVariable()); } return true; } @@ -453,14 +522,13 @@ bool FastISel::SelectCall(const User *I) { switch (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)) { default: break; case TargetLowering::Expand: { - assert(MBB->isLandingPad() && "Call to eh.exception not in landing pad!"); + assert(FuncInfo.MBB->isLandingPad() && + "Call to eh.exception not in landing pad!"); unsigned Reg = TLI.getExceptionAddressRegister(); const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned ResultReg = createResultReg(RC); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - Reg, RC, RC, DL); - assert(InsertedCopy && "Can't copy address registers!"); - InsertedCopy = InsertedCopy; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Reg); UpdateValueMap(I, ResultReg); return true; } @@ -472,25 +540,23 @@ bool FastISel::SelectCall(const User *I) { switch (TLI.getOperationAction(ISD::EHSELECTION, VT)) { default: break; case TargetLowering::Expand: { - if (MBB->isLandingPad()) - AddCatchInfo(*cast(I), &MF.getMMI(), MBB); + if (FuncInfo.MBB->isLandingPad()) + AddCatchInfo(*cast(I), &FuncInfo.MF->getMMI(), FuncInfo.MBB); else { #ifndef NDEBUG - CatchInfoLost.insert(cast(I)); + FuncInfo.CatchInfoLost.insert(cast(I)); #endif // FIXME: Mark exception selector register as live in. Hack for PR1508. 
unsigned Reg = TLI.getExceptionSelectorRegister(); - if (Reg) MBB->addLiveIn(Reg); + if (Reg) FuncInfo.MBB->addLiveIn(Reg); } unsigned Reg = TLI.getExceptionSelectorRegister(); EVT SrcVT = TLI.getPointerTy(); const TargetRegisterClass *RC = TLI.getRegClassFor(SrcVT); unsigned ResultReg = createResultReg(RC); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, Reg, - RC, RC, DL); - assert(InsertedCopy && "Can't copy address registers!"); - InsertedCopy = InsertedCopy; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Reg); bool ResultRegIsKill = hasTrivialKill(I); @@ -605,12 +671,12 @@ bool FastISel::SelectBitCast(const User *I) { if (SrcVT.getSimpleVT() == DstVT.getSimpleVT()) { TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT); TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT); - ResultReg = createResultReg(DstClass); - - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - Op0, DstClass, SrcClass, DL); - if (!InsertedCopy) - ResultReg = 0; + // Don't attempt a cross-class copy. It will likely fail. + if (SrcClass == DstClass) { + ResultReg = createResultReg(DstClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(Op0); + } } // If the reg-reg copy failed, select a BIT_CONVERT opcode. @@ -655,14 +721,15 @@ FastISel::SelectInstruction(const Instruction *I) { /// unless it is the immediate (fall-through) successor, and update /// the CFG. void -FastISel::FastEmitBranch(MachineBasicBlock *MSucc) { - if (MBB->isLayoutSuccessor(MSucc)) { +FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DL) { + if (FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // The unconditional fall-through case, which needs no instructions. } else { // The unconditional branch case. - TII.InsertBranch(*MBB, MSucc, NULL, SmallVector()); + TII.InsertBranch(*FuncInfo.MBB, MSucc, NULL, + SmallVector(), DL); } - MBB->addSuccessor(MSucc); + FuncInfo.MBB->addSuccessor(MSucc); } /// SelectFNeg - Emit an FNeg operation. @@ -711,9 +778,40 @@ FastISel::SelectFNeg(const User *I) { return true; } +bool +FastISel::SelectLoad(const User *I) { + LoadInst *LI = const_cast(cast(I)); + + // For a load from an alloca, make a limited effort to find the value + // already available in a register, avoiding redundant loads. 
+ if (!LI->isVolatile() && isa(LI->getPointerOperand())) { + BasicBlock::iterator ScanFrom = LI; + if (const Value *V = FindAvailableLoadedValue(LI->getPointerOperand(), + LI->getParent(), ScanFrom)) { + if (!V->use_empty() && + (!isa(V) || + cast(V)->getParent() == LI->getParent() || + (isa(V) && + FuncInfo.StaticAllocaMap.count(cast(V)))) && + (!isa(V) || + LI->getParent() == &LI->getParent()->getParent()->getEntryBlock())) { + unsigned ResultReg = getRegForValue(V); + if (ResultReg != 0) { + UpdateValueMap(I, ResultReg); + return true; + } + } + } + } + + return false; +} + bool FastISel::SelectOperator(const User *I, unsigned Opcode) { switch (Opcode) { + case Instruction::Load: + return SelectLoad(I); case Instruction::Add: return SelectBinaryOp(I, ISD::ADD); case Instruction::FAdd: @@ -762,8 +860,8 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { if (BI->isUnconditional()) { const BasicBlock *LLVMSucc = BI->getSuccessor(0); - MachineBasicBlock *MSucc = MBBMap[LLVMSucc]; - FastEmitBranch(MSucc); + MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc]; + FastEmitBranch(MSucc, BI->getDebugLoc()); return true; } @@ -778,7 +876,7 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { case Instruction::Alloca: // FunctionLowering has the static-sized case covered. - if (StaticAllocaMap.count(cast(I))) + if (FuncInfo.StaticAllocaMap.count(cast(I))) return true; // Dynamic-sized alloca is not handled yet. @@ -824,32 +922,16 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { } } -FastISel::FastISel(MachineFunction &mf, - DenseMap &vm, - DenseMap &bm, - DenseMap &am, - std::vector > &pn -#ifndef NDEBUG - , SmallSet &cil -#endif - ) - : MBB(0), - ValueMap(vm), - MBBMap(bm), - StaticAllocaMap(am), - PHINodesToUpdate(pn), -#ifndef NDEBUG - CatchInfoLost(cil), -#endif - MF(mf), - MRI(MF.getRegInfo()), - MFI(*MF.getFrameInfo()), - MCP(*MF.getConstantPool()), - TM(MF.getTarget()), +FastISel::FastISel(FunctionLoweringInfo &funcInfo) + : FuncInfo(funcInfo), + MRI(FuncInfo.MF->getRegInfo()), + MFI(*FuncInfo.MF->getFrameInfo()), + MCP(*FuncInfo.MF->getConstantPool()), + TM(FuncInfo.MF->getTarget()), TD(*TM.getTargetData()), TII(*TM.getInstrInfo()), TLI(*TM.getTargetLowering()), - IsBottomUp(false) { + TRI(*TM.getRegisterInfo()) { } FastISel::~FastISel() {} @@ -978,7 +1060,7 @@ unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode, unsigned ResultReg = createResultReg(RC); const TargetInstrDesc &II = TII.get(MachineInstOpcode); - BuildMI(MBB, DL, II, ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg); return ResultReg; } @@ -989,13 +1071,13 @@ unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg).addReg(Op0, Op0IsKill * RegState::Kill); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) + .addReg(Op0, Op0IsKill * RegState::Kill); else { - BuildMI(MBB, DL, II).addReg(Op0, Op0IsKill * RegState::Kill); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) + .addReg(Op0, Op0IsKill * RegState::Kill); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; @@ -1009,17 +1091,15 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if 
(II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1032,17 +1112,15 @@ unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addImm(Imm); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addImm(Imm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1055,17 +1133,15 @@ unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addFPImm(FPImm); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addFPImm(FPImm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1079,19 +1155,17 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill) .addImm(Imm); else { - BuildMI(MBB, DL, II) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addReg(Op0, Op0IsKill * RegState::Kill) .addReg(Op1, Op1IsKill * RegState::Kill) .addImm(Imm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1103,13 +1177,11 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, const TargetInstrDesc &II = TII.get(MachineInstOpcode); if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg).addImm(Imm); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg).addImm(Imm); else { - BuildMI(MBB, DL, II).addImm(Imm); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II).addImm(Imm); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, 
TII.get(TargetOpcode::COPY), + ResultReg).addReg(II.ImplicitDefs[0]); } return ResultReg; } @@ -1117,24 +1189,12 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode, unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill, uint32_t Idx) { - const TargetRegisterClass* RC = MRI.getRegClass(Op0); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); - const TargetInstrDesc &II = TII.get(TargetOpcode::EXTRACT_SUBREG); - - if (II.getNumDefs() >= 1) - BuildMI(MBB, DL, II, ResultReg) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addImm(Idx); - else { - BuildMI(MBB, DL, II) - .addReg(Op0, Op0IsKill * RegState::Kill) - .addImm(Idx); - bool InsertedCopy = TII.copyRegToReg(*MBB, MBB->end(), ResultReg, - II.ImplicitDefs[0], RC, RC, DL); - if (!InsertedCopy) - ResultReg = 0; - } + assert(TargetRegisterInfo::isVirtualRegister(Op0) && + "Cannot yet extract from physregs"); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill), Idx); return ResultReg; } @@ -1154,14 +1214,14 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { const TerminatorInst *TI = LLVMBB->getTerminator(); SmallPtrSet SuccsHandled; - unsigned OrigNumPHINodesToUpdate = PHINodesToUpdate.size(); + unsigned OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size(); // Check successor nodes' PHI nodes that expect a constant to be available // from this block. for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) { const BasicBlock *SuccBB = TI->getSuccessor(succ); if (!isa(SuccBB->begin())) continue; - MachineBasicBlock *SuccMBB = MBBMap[SuccBB]; + MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB]; // If this terminator has multiple identical successors (common for // switches), only handle each succ once. @@ -1182,7 +1242,7 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { // by bailing out early, we may leave behind some dead instructions, // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its // own moves. Second, this check is necessary because FastISel doesn't - // use CreateRegForValue to create registers, so it always creates + // use CreateRegs to create registers, so it always creates // exactly one register for each non-void instruction.
EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true); if (VT == MVT::Other || !TLI.isTypeLegal(VT)) { @@ -1190,7 +1250,7 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { if (VT == MVT::i1) VT = TLI.getTypeToTransformTo(LLVMBB->getContext(), VT); else { - PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); + FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); return false; } } @@ -1205,10 +1265,10 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { unsigned Reg = getRegForValue(PHIOp); if (Reg == 0) { - PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); + FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate); return false; } - PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg)); + FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg)); DL = DebugLoc(); } } diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 65c36c1289db..928e1ecd4cf4 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "function-lowering-info" -#include "FunctionLoweringInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" @@ -30,7 +30,6 @@ #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" @@ -47,9 +46,11 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) { if (isa(I)) return true; const BasicBlock *BB = I->getParent(); for (Value::const_use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (cast(*UI)->getParent() != BB || isa(*UI)) + UI != E; ++UI) { + const User *U = *UI; + if (cast(U)->getParent() != BB || isa(U)) return true; + } return false; } @@ -64,9 +65,11 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool EnableFastISel) { const BasicBlock *Entry = A->getParent()->begin(); for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end(); - UI != E; ++UI) - if (cast(*UI)->getParent() != Entry || isa(*UI)) + UI != E; ++UI) { + const User *U = *UI; + if (cast(U)->getParent() != Entry || isa(U)) return false; // Use not in entry block. + } return true; } @@ -74,12 +77,18 @@ FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli) : TLI(tli) { } -void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, - bool EnableFastISel) { +void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) { Fn = &fn; MF = &mf; RegInfo = &MF->getRegInfo(); + // Check whether the function can return without sret-demotion. + SmallVector Outs; + GetReturnInfo(Fn->getReturnType(), + Fn->getAttributes().getRetAttributes(), Outs, TLI); + CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), Fn->isVarArg(), + Outs, Fn->getContext()); + // Create a vreg for each argument register that is not dead and is used // outside of the entry block for the function. 
for (Function::const_arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end(); @@ -172,31 +181,33 @@ void FunctionLoweringInfo::clear() { #endif LiveOutRegInfo.clear(); ArgDbgValues.clear(); + RegFixups.clear(); } -unsigned FunctionLoweringInfo::MakeReg(EVT VT) { +/// CreateReg - Allocate a single virtual register for the given type. +unsigned FunctionLoweringInfo::CreateReg(EVT VT) { return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT)); } -/// CreateRegForValue - Allocate the appropriate number of virtual registers of +/// CreateRegs - Allocate the appropriate number of virtual registers of /// the correctly promoted or expanded types. Assign these registers /// consecutive vreg numbers and return the first assigned number. /// /// In the case that the given value has struct or array type, this function /// will assign registers for each member or element. /// -unsigned FunctionLoweringInfo::CreateRegForValue(const Value *V) { +unsigned FunctionLoweringInfo::CreateRegs(const Type *Ty) { SmallVector ValueVTs; - ComputeValueVTs(TLI, V->getType(), ValueVTs); + ComputeValueVTs(TLI, Ty, ValueVTs); unsigned FirstReg = 0; for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { EVT ValueVT = ValueVTs[Value]; - EVT RegisterVT = TLI.getRegisterType(V->getContext(), ValueVT); + EVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT); - unsigned NumRegs = TLI.getNumRegisters(V->getContext(), ValueVT); + unsigned NumRegs = TLI.getNumRegisters(Ty->getContext(), ValueVT); for (unsigned i = 0; i != NumRegs; ++i) { - unsigned R = MakeReg(RegisterVT); + unsigned R = CreateReg(RegisterVT); if (!FirstReg) FirstReg = R; } } @@ -208,7 +219,7 @@ unsigned FunctionLoweringInfo::CreateRegForValue(const Value *V) { void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, MachineBasicBlock *MBB) { // Inform the MachineModuleInfo of the personality for this landing pad. - const ConstantExpr *CE = cast(I.getOperand(2)); + const ConstantExpr *CE = cast(I.getArgOperand(1)); assert(CE->getOpcode() == Instruction::BitCast && isa(CE->getOperand(0)) && "Personality should be a function"); @@ -217,18 +228,18 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, // Gather all the type infos for this landing pad and pass them along to // MachineModuleInfo. std::vector TyInfo; - unsigned N = I.getNumOperands(); + unsigned N = I.getNumArgOperands(); - for (unsigned i = N - 1; i > 2; --i) { - if (const ConstantInt *CI = dyn_cast(I.getOperand(i))) { + for (unsigned i = N - 1; i > 1; --i) { + if (const ConstantInt *CI = dyn_cast(I.getArgOperand(i))) { unsigned FilterLength = CI->getZExtValue(); unsigned FirstCatch = i + FilterLength + !FilterLength; - assert (FirstCatch <= N && "Invalid filter length"); + assert(FirstCatch <= N && "Invalid filter length"); if (FirstCatch < N) { TyInfo.reserve(N - FirstCatch); for (unsigned j = FirstCatch; j < N; ++j) - TyInfo.push_back(ExtractTypeInfo(I.getOperand(j))); + TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j))); MMI->addCatchTypeInfo(MBB, TyInfo); TyInfo.clear(); } @@ -240,7 +251,7 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, // Filter. 
TyInfo.reserve(FilterLength - 1); for (unsigned j = i + 1; j < FirstCatch; ++j) - TyInfo.push_back(ExtractTypeInfo(I.getOperand(j))); + TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j))); MMI->addFilterTypeInfo(MBB, TyInfo); TyInfo.clear(); } @@ -249,10 +260,10 @@ void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI, } } - if (N > 3) { - TyInfo.reserve(N - 3); - for (unsigned j = 3; j < N; ++j) - TyInfo.push_back(ExtractTypeInfo(I.getOperand(j))); + if (N > 2) { + TyInfo.reserve(N - 2); + for (unsigned j = 2; j < N; ++j) + TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j))); MMI->addCatchTypeInfo(MBB, TyInfo); } } diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h deleted file mode 100644 index 4067a5b33044..000000000000 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.h +++ /dev/null @@ -1,144 +0,0 @@ -//===-- FunctionLoweringInfo.h - Lower functions from LLVM IR to CodeGen --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This implements routines for translating functions from LLVM IR into -// Machine IR. -// -//===----------------------------------------------------------------------===// - -#ifndef FUNCTIONLOWERINGINFO_H -#define FUNCTIONLOWERINGINFO_H - -#include "llvm/InlineAsm.h" -#include "llvm/Instructions.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#ifndef NDEBUG -#include "llvm/ADT/SmallSet.h" -#endif -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/Support/CallSite.h" -#include - -namespace llvm { - -class AllocaInst; -class BasicBlock; -class CallInst; -class Function; -class GlobalVariable; -class Instruction; -class MachineInstr; -class MachineBasicBlock; -class MachineFunction; -class MachineModuleInfo; -class MachineRegisterInfo; -class TargetLowering; -class Value; - -//===--------------------------------------------------------------------===// -/// FunctionLoweringInfo - This contains information that is global to a -/// function that is used when lowering a region of the function. -/// -class FunctionLoweringInfo { -public: - const TargetLowering &TLI; - const Function *Fn; - MachineFunction *MF; - MachineRegisterInfo *RegInfo; - - /// CanLowerReturn - true iff the function's return value can be lowered to - /// registers. - bool CanLowerReturn; - - /// DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg - /// allocated to hold a pointer to the hidden sret parameter. - unsigned DemoteRegister; - - /// MBBMap - A mapping from LLVM basic blocks to their machine code entry. - DenseMap MBBMap; - - /// ValueMap - Since we emit code for the function a basic block at a time, - /// we must remember which virtual registers hold the values for - /// cross-basic-block values. - DenseMap ValueMap; - - /// StaticAllocaMap - Keep track of frame indices for fixed sized allocas in - /// the entry block. This allows the allocas to be efficiently referenced - /// anywhere in the function. - DenseMap StaticAllocaMap; - - /// ArgDbgValues - A list of DBG_VALUE instructions created during isel for - /// function arguments that are inserted after scheduling is completed. 
- SmallVector ArgDbgValues; - -#ifndef NDEBUG - SmallSet CatchInfoLost; - SmallSet CatchInfoFound; -#endif - - struct LiveOutInfo { - unsigned NumSignBits; - APInt KnownOne, KnownZero; - LiveOutInfo() : NumSignBits(0), KnownOne(1, 0), KnownZero(1, 0) {} - }; - - /// LiveOutRegInfo - Information about live out vregs, indexed by their - /// register number offset by 'FirstVirtualRegister'. - std::vector LiveOutRegInfo; - - /// PHINodesToUpdate - A list of phi instructions whose operand list will - /// be updated after processing the current basic block. - /// TODO: This isn't per-function state, it's per-basic-block state. But - /// there's no other convenient place for it to live right now. - std::vector > PHINodesToUpdate; - - explicit FunctionLoweringInfo(const TargetLowering &TLI); - - /// set - Initialize this FunctionLoweringInfo with the given Function - /// and its associated MachineFunction. - /// - void set(const Function &Fn, MachineFunction &MF, bool EnableFastISel); - - /// clear - Clear out all the function-specific state. This returns this - /// FunctionLoweringInfo to an empty state, ready to be used for a - /// different function. - void clear(); - - unsigned MakeReg(EVT VT); - - /// isExportedInst - Return true if the specified value is an instruction - /// exported from its block. - bool isExportedInst(const Value *V) { - return ValueMap.count(V); - } - - unsigned CreateRegForValue(const Value *V); - - unsigned InitializeRegForValue(const Value *V) { - unsigned &R = ValueMap[V]; - assert(R == 0 && "Already initialized this value register!"); - return R = CreateRegForValue(V); - } -}; - -/// AddCatchInfo - Extract the personality and type infos from an eh.selector -/// call, and add them to the specified machine basic block. -void AddCatchInfo(const CallInst &I, - MachineModuleInfo *MMI, MachineBasicBlock *MBB); - -/// CopyCatchInfo - Copy catch information from DestBB to SrcBB. -void CopyCatchInfo(const BasicBlock *SrcBB, const BasicBlock *DestBB, - MachineModuleInfo *MMI, FunctionLoweringInfo &FLI); - -} // end namespace llvm - -#endif diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 16eb8a72adc8..61c2a90e7edc 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -123,7 +123,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, EVT VT = Node->getValueType(ResNo); const TargetRegisterClass *SrcRC = 0, *DstRC = 0; - SrcRC = TRI->getPhysicalRegisterRegClass(SrcReg, VT); + SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT); // Figure out the register class to create for the destreg. if (VRBase) { @@ -142,11 +142,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, } else { // Create the reg, emit the copy. 
VRBase = MRI->createVirtualRegister(DstRC); - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, VRBase, SrcReg, - DstRC, SrcRC, Node->getDebugLoc()); - - assert(Emitted && "Unable to issue a copy instruction!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), + VRBase).addReg(SrcReg); } SDValue Op(Node, ResNo); @@ -246,7 +243,7 @@ unsigned InstrEmitter::getVR(SDValue Op, const TargetRegisterClass *RC = TLI->getRegClassFor(Op.getValueType()); VReg = MRI->createVirtualRegister(RC); } - BuildMI(MBB, Op.getDebugLoc(), + BuildMI(*MBB, InsertPos, Op.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), VReg); return VReg; } @@ -288,10 +285,8 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, "Don't have operand info for this instruction!"); if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) { unsigned NewVReg = MRI->createVirtualRegister(DstRC); - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, NewVReg, VReg, - DstRC, SrcRC, Op.getNode()->getDebugLoc()); - assert(Emitted && "Unable to issue a copy instruction!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); VReg = NewVReg; } } @@ -428,12 +423,9 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, } if (Opc == TargetOpcode::EXTRACT_SUBREG) { + // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub unsigned SubIdx = cast(Node->getOperand(1))->getZExtValue(); - // Create the extract_subreg machine instruction. - MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), - TII->get(TargetOpcode::EXTRACT_SUBREG)); - // Figure out the register class to create for the destreg. unsigned VReg = getVR(Node->getOperand(0), VRBaseMap); const TargetRegisterClass *TRC = MRI->getRegClass(VReg); @@ -450,11 +442,16 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, VRBase = MRI->createVirtualRegister(SRC); } - // Add def, source, and subreg index - MI->addOperand(MachineOperand::CreateReg(VRBase, true)); + // Create the extract_subreg machine instruction. + MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), + TII->get(TargetOpcode::COPY), VRBase); + + // Add source, and subreg index AddOperand(MI, Node->getOperand(0), 0, 0, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); - MI->addOperand(MachineOperand::CreateImm(SubIdx)); + assert(TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()) && + "Cannot yet extract from physregs"); + MI->getOperand(1).setSubReg(SubIdx); MBB->insert(InsertPos, MI); } else if (Opc == TargetOpcode::INSERT_SUBREG || Opc == TargetOpcode::SUBREG_TO_REG) { @@ -511,18 +508,13 @@ void InstrEmitter::EmitCopyToRegClassNode(SDNode *Node, DenseMap &VRBaseMap) { unsigned VReg = getVR(Node->getOperand(0), VRBaseMap); - const TargetRegisterClass *SrcRC = MRI->getRegClass(VReg); + // Create the new VReg in the destination class and emit a copy. unsigned DstRCIdx = cast(Node->getOperand(1))->getZExtValue(); const TargetRegisterClass *DstRC = TRI->getRegClass(DstRCIdx); - - // Create the new VReg in the destination class and emit a copy. 
unsigned NewVReg = MRI->createVirtualRegister(DstRC); - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, NewVReg, VReg, - DstRC, SrcRC, Node->getDebugLoc()); - assert(Emitted && - "Unable to issue a copy instruction for a COPY_TO_REGCLASS node!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), + NewVReg).addReg(VReg); SDValue Op(Node, 0); bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second; @@ -604,9 +596,10 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, } else if (SD->getKind() == SDDbgValue::CONST) { const Value *V = SD->getConst(); if (const ConstantInt *CI = dyn_cast(V)) { - // FIXME: SDDbgValues aren't updated with legalization, so it's possible - // to have i128 values in them at this point. As a crude workaround, just - // drop the debug info if this happens. + // FIXME: SDDbgValue constants aren't updated with legalization, so it's + // possible to have i128 constants in them at this point. Dwarf writer + // does not handle i128 constants at the moment so, as a crude workaround, + // just drop the debug info if this happens. if (!CI->getValue().isSignedIntN(64)) MIB.addReg(0U); else @@ -676,6 +669,33 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, // Create the new machine instruction. MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), II); + + // The MachineInstr constructor adds implicit-def operands. Scan through + // these to determine which are dead. + if (MI->getNumOperands() != 0 && + Node->getValueType(Node->getNumValues()-1) == MVT::Flag) { + // First, collect all used registers. + SmallVector UsedRegs; + for (SDNode *F = Node->getFlaggedUser(); F; F = F->getFlaggedUser()) + if (F->getOpcode() == ISD::CopyFromReg) + UsedRegs.push_back(cast(F->getOperand(1))->getReg()); + else { + // Collect declared implicit uses. + const TargetInstrDesc &TID = TII->get(F->getMachineOpcode()); + UsedRegs.append(TID.getImplicitUses(), + TID.getImplicitUses() + TID.getNumImplicitUses()); + // In addition to declared implicit uses, we must also check for + // direct RegisterSDNode operands. + for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i) + if (RegisterSDNode *R = dyn_cast(F->getOperand(i))) { + unsigned Reg = R->getReg(); + if (Reg != 0 && TargetRegisterInfo::isPhysicalRegister(Reg)) + UsedRegs.push_back(Reg); + } + } + // Then mark unused registers as dead. + MI->setPhysRegsDeadExcept(UsedRegs, *TRI); + } // Add result register values for things that are defined by this // instruction. @@ -696,16 +716,24 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, MI->setMemRefs(cast(Node)->memoperands_begin(), cast(Node)->memoperands_end()); + // Insert the instruction into position in the block. This needs to + // happen before any custom inserter hook is called so that the + // hook knows where in the block to insert the replacement code. + MBB->insert(InsertPos, MI); + if (II.usesCustomInsertionHook()) { // Insert this instruction into the basic block using a target // specific inserter which may return a new basic block. - MBB = TLI->EmitInstrWithCustomInserter(MI, MBB); - InsertPos = MBB->end(); + bool AtEnd = InsertPos == MBB->end(); + MachineBasicBlock *NewMBB = TLI->EmitInstrWithCustomInserter(MI, MBB); + if (NewMBB != MBB) { + if (AtEnd) + InsertPos = NewMBB->end(); + MBB = NewMBB; + } return; } - MBB->insert(InsertPos, MI); - // Additional results must be a physical register def.
if (HasPhysRegOuts) { for (unsigned i = II.getNumDefs(); i < NumResults; ++i) { @@ -761,24 +789,9 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, unsigned DestReg = cast(Node->getOperand(1))->getReg(); if (SrcReg == DestReg) // Coalesced away the copy? Ignore. break; - - const TargetRegisterClass *SrcTRC = 0, *DstTRC = 0; - // Get the register classes of the src/dst. - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - SrcTRC = MRI->getRegClass(SrcReg); - else - SrcTRC = TRI->getPhysicalRegisterRegClass(SrcReg,SrcVal.getValueType()); - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - DstTRC = MRI->getRegClass(DestReg); - else - DstTRC = TRI->getPhysicalRegisterRegClass(DestReg, - Node->getOperand(1).getValueType()); - - bool Emitted = TII->copyRegToReg(*MBB, InsertPos, DestReg, SrcReg, - DstTRC, SrcTRC, Node->getDebugLoc()); - assert(Emitted && "Unable to issue a copy instruction!\n"); - (void) Emitted; + BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY), + DestReg).addReg(SrcReg); break; } case ISD::CopyFromReg: { @@ -807,6 +820,12 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, const char *AsmStr = cast(AsmStrV)->getSymbol(); MI->addOperand(MachineOperand::CreateES(AsmStr)); + // Add the isAlignStack bit. + int64_t isAlignStack = + cast(Node->getOperand(InlineAsm::Op_IsAlignStack))-> + getZExtValue(); + MI->addOperand(MachineOperand::CreateImm(isAlignStack)); + // Add all of the operand registers to the instruction. for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) { unsigned Flags = @@ -821,14 +840,22 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, case InlineAsm::Kind_RegDef: for (; NumVals; --NumVals, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); - MI->addOperand(MachineOperand::CreateReg(Reg, true)); + // FIXME: Add dead flags for physical and virtual registers defined. + // For now, mark physical register defs as implicit to help fast + // regalloc. This makes inline asm look a lot like calls. + MI->addOperand(MachineOperand::CreateReg(Reg, true, + /*isImp=*/ TargetRegisterInfo::isPhysicalRegister(Reg))); } break; case InlineAsm::Kind_RegDefEarlyClobber: for (; NumVals; --NumVals, ++i) { unsigned Reg = cast(Node->getOperand(i))->getReg(); - MI->addOperand(MachineOperand::CreateReg(Reg, true, false, false, - false, false, true)); + MI->addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/ true, + /*isImp=*/ TargetRegisterInfo::isPhysicalRegister(Reg), + /*isKill=*/ false, + /*isDead=*/ false, + /*isUndef=*/false, + /*isEarlyClobber=*/ true)); } break; case InlineAsm::Kind_RegUse: // Use of register. diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 62a37a5fd0ae..7a47da4ec52e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -31,6 +31,7 @@ #include "llvm/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/DenseMap.h" @@ -133,7 +134,7 @@ private: /// whose vector element type is narrower than the original shuffle type. /// e.g. 
<0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3> SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, - SDValue N1, SDValue N2, + SDValue N1, SDValue N2, SmallVectorImpl &Mask) const; bool LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest, @@ -143,6 +144,8 @@ private: DebugLoc dl); SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); + std::pair ExpandChainLibCall(RTLIB::Libcall LC, + SDNode *Node, bool isSigned); SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_PPCF128); @@ -172,6 +175,8 @@ private: SDValue ExpandExtractFromVectorThroughStack(SDValue Op); SDValue ExpandVectorBuildThroughStack(SDNode* Node); + std::pair ExpandAtomic(SDNode *Node); + void ExpandNode(SDNode *Node, SmallVectorImpl &Results); void PromoteNode(SDNode *Node, SmallVectorImpl &Results); }; @@ -181,8 +186,8 @@ private: /// performs the same shuffe in terms of order or result bytes, but on a type /// whose vector element type is narrower than the original shuffle type. /// e.g. <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3> -SDValue -SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, +SDValue +SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, SDValue N1, SDValue N2, SmallVectorImpl &Mask) const { unsigned NumMaskElts = VT.getVectorNumElements(); @@ -193,12 +198,12 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl, if (NumEltsGrowth == 1) return DAG.getVectorShuffle(NVT, dl, N1, N2, &Mask[0]); - + SmallVector NewMask; for (unsigned i = 0; i != NumMaskElts; ++i) { int Idx = Mask[i]; for (unsigned j = 0; j != NumEltsGrowth; ++j) { - if (Idx < 0) + if (Idx < 0) NewMask.push_back(-1); else NewMask.push_back(Idx * NumEltsGrowth + j); @@ -320,7 +325,8 @@ bool SelectionDAGLegalize::LegalizeAllNodesNotLeadingTo(SDNode *N, SDNode *Dest, bool OperandsLeadToDest = false; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) OperandsLeadToDest |= // If an operand leads to Dest, so do we. - LegalizeAllNodesNotLeadingTo(N->getOperand(i).getNode(), Dest, NodesLeadingTo); + LegalizeAllNodesNotLeadingTo(N->getOperand(i).getNode(), Dest, + NodesLeadingTo); if (OperandsLeadToDest) { NodesLeadingTo.insert(N); @@ -357,7 +363,7 @@ static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP, EVT SVT = VT; while (SVT != MVT::f32) { SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1); - if (CFP->isValueValidForType(SVT, CFP->getValueAPF()) && + if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) && // Only do this if the target has a native EXTLOAD instruction from // smaller type. TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) && @@ -372,8 +378,8 @@ static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP, SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy()); unsigned Alignment = cast(CPIdx)->getAlignment(); if (Extend) - return DAG.getExtLoad(ISD::EXTLOAD, dl, - OrigVT, DAG.getEntryNode(), + return DAG.getExtLoad(ISD::EXTLOAD, OrigVT, dl, + DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, VT, false, false, Alignment); return DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx, @@ -450,7 +456,7 @@ SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, 8 * (StoredBytes - Offset)); // Load from the stack slot. 
- SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr, + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, RegVT, dl, Store, StackPtr, NULL, 0, MemVT, false, false, 0); Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr, @@ -552,7 +558,7 @@ SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, // The last copy may be partial. Do an extending load. EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), 8 * (LoadedBytes - Offset)); - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr, + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, RegVT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset + Offset, MemVT, LD->isVolatile(), LD->isNonTemporal(), @@ -568,7 +574,7 @@ SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, Stores.size()); // Finally, perform the original load only redirected to the stack slot. - Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase, + Load = DAG.getExtLoad(LD->getExtensionType(), VT, dl, TF, StackBase, NULL, 0, LoadedVT, false, false, 0); // Callers expect a MERGE_VALUES node. @@ -597,23 +603,23 @@ SDValue ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, // Load the value in two parts SDValue Lo, Hi; if (TLI.isLittleEndian()) { - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getSrcValue(), + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset, NewLoadedVT, LD->isVolatile(), LD->isNonTemporal(), Alignment); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, TLI.getPointerTy())); - Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getSrcValue(), + Hi = DAG.getExtLoad(HiExtType, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), MinAlign(Alignment, IncrementSize)); + LD->isNonTemporal(), MinAlign(Alignment,IncrementSize)); } else { - Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getSrcValue(), + Hi = DAG.getExtLoad(HiExtType, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset, NewLoadedVT, LD->isVolatile(), LD->isNonTemporal(), Alignment); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, TLI.getPointerTy())); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getSrcValue(), + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, VT, dl, Chain, Ptr, LD->getSrcValue(), SVOffset + IncrementSize, NewLoadedVT, LD->isVolatile(), - LD->isNonTemporal(), MinAlign(Alignment, IncrementSize)); + LD->isNonTemporal(), MinAlign(Alignment,IncrementSize)); } // aggregate the two parts @@ -773,7 +779,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { "Unexpected illegal type!"); for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) - assert((isTypeLegal(Node->getOperand(i).getValueType()) || + assert((isTypeLegal(Node->getOperand(i).getValueType()) || Node->getOperand(i).getOpcode() == ISD::TargetConstant) && "Unexpected illegal type!"); @@ -853,6 +859,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { case ISD::MERGE_VALUES: case ISD::EH_RETURN: case ISD::FRAME_TO_ARGS_OFFSET: + case ISD::EH_SJLJ_SETJMP: + case ISD::EH_SJLJ_LONGJMP: // These operations lie about being legal: when they claim to be legal, // they should actually be expanded. 
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -925,8 +933,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { break; } - Result = DAG.UpdateNodeOperands(Result.getValue(0), Ops.data(), - Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), Ops.data(), + Ops.size()), 0); switch (Action) { case TargetLowering::Legal: for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) @@ -1000,11 +1008,11 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { NodesLeadingTo); } - // Now that we legalized all of the inputs (which may have inserted - // libcalls) create the new CALLSEQ_START node. + // Now that we have legalized all of the inputs (which may have inserted + // libcalls), create the new CALLSEQ_START node. Tmp1 = LegalizeOp(Node->getOperand(0)); // Legalize the chain. - // Merge in the last call, to ensure that this call start after the last + // Merge in the last call to ensure that this call starts after the last // call ended. if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) { Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, @@ -1016,7 +1024,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { if (Tmp1 != Node->getOperand(0)) { SmallVector Ops(Node->op_begin(), Node->op_end()); Ops[0] = Tmp1; - Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), &Ops[0], + Ops.size()), Result.getResNo()); } // Remember that the CALLSEQ_START is legalized. @@ -1058,7 +1067,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { if (Tmp1 != Node->getOperand(0)) { SmallVector Ops(Node->op_begin(), Node->op_end()); Ops[0] = Tmp1; - Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + &Ops[0], Ops.size()), + Result.getResNo()); } } else { Tmp2 = LegalizeOp(Node->getOperand(Node->getNumOperands()-1)); @@ -1067,7 +1078,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { SmallVector Ops(Node->op_begin(), Node->op_end()); Ops[0] = Tmp1; Ops.back() = Tmp2; - Result = DAG.UpdateNodeOperands(Result, &Ops[0], Ops.size()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + &Ops[0], Ops.size()), + Result.getResNo()); } } assert(IsLegalizingCall && "Call sequence imbalance between start/end?"); @@ -1087,7 +1100,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) { EVT VT = Node->getValueType(0); - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp2, LD->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp2, LD->getOffset()), + Result.getResNo()); Tmp3 = Result.getValue(0); Tmp4 = Result.getValue(1); @@ -1100,7 +1115,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { const Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); if (LD->getAlignment() < ABIAlignment){ - Result = ExpandUnalignedLoad(cast(Result.getNode()), + Result = ExpandUnalignedLoad(cast(Result.getNode()), DAG, TLI); Tmp3 = Result.getOperand(0); Tmp4 = Result.getOperand(1); @@ -1166,7 +1181,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { ISD::LoadExtType NewExtType = ExtType == ISD::ZEXTLOAD ? 
ISD::ZEXTLOAD : ISD::EXTLOAD; - Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), + Result = DAG.getExtLoad(NewExtType, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset, NVT, isVolatile, isNonTemporal, Alignment); @@ -1202,8 +1217,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { if (TLI.isLittleEndian()) { // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) // Load the bottom RoundWidth bits. - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, - Node->getValueType(0), Tmp1, Tmp2, + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, Node->getValueType(0), dl, + Tmp1, Tmp2, LD->getSrcValue(), SVOffset, RoundVT, isVolatile, isNonTemporal, Alignment); @@ -1211,13 +1226,13 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { IncrementSize = RoundWidth / 8; Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, + Hi = DAG.getExtLoad(ExtType, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset + IncrementSize, ExtraVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize)); - // Build a factor node to remember that this load is independent of the - // other one. + // Build a factor node to remember that this load is independent of + // the other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); @@ -1231,7 +1246,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // Big endian - avoid unaligned loads. // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 // Load the top RoundWidth bits. - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, + Hi = DAG.getExtLoad(ExtType, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset, RoundVT, isVolatile, isNonTemporal, Alignment); @@ -1239,14 +1254,14 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { IncrementSize = RoundWidth / 8; Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, - Node->getValueType(0), Tmp1, Tmp2, + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, + Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), SVOffset + IncrementSize, ExtraVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize)); - // Build a factor node to remember that this load is independent of the - // other one. + // Build a factor node to remember that this load is independent of + // the other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); @@ -1267,7 +1282,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { isCustom = true; // FALLTHROUGH case TargetLowering::Legal: - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp2, LD->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp2, LD->getOffset()), + Result.getResNo()); Tmp1 = Result.getValue(0); Tmp2 = Result.getValue(1); @@ -1281,10 +1298,12 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // If this is an unaligned load and the target doesn't support it, // expand it. 
if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - const Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); + const Type *Ty = + LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); if (LD->getAlignment() < ABIAlignment){ - Result = ExpandUnalignedLoad(cast(Result.getNode()), + Result = ExpandUnalignedLoad(cast(Result.getNode()), DAG, TLI); Tmp1 = Result.getOperand(0); Tmp2 = Result.getOperand(1); @@ -1310,10 +1329,11 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { Tmp2 = LegalizeOp(Load.getValue(1)); break; } - assert(ExtType != ISD::EXTLOAD &&"EXTLOAD should always be supported!"); + assert(ExtType != ISD::EXTLOAD && + "EXTLOAD should always be supported!"); // Turn the unsupported load into an EXTLOAD followed by an explicit // zero/sign extend inreg. - Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), + Result = DAG.getExtLoad(ISD::EXTLOAD, Node->getValueType(0), dl, Tmp1, Tmp2, LD->getSrcValue(), LD->getSrcValueOffset(), SrcVT, LD->isVolatile(), LD->isNonTemporal(), @@ -1355,8 +1375,10 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { { Tmp3 = LegalizeOp(ST->getValue()); - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2, - ST->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp3, Tmp2, + ST->getOffset()), + Result.getResNo()); EVT VT = Tmp3.getValueType(); switch (TLI.getOperationAction(ISD::STORE, VT)) { @@ -1366,7 +1388,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // expand it. if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); if (ST->getAlignment() < ABIAlignment) Result = ExpandUnalignedStore(cast(Result.getNode()), DAG, TLI); @@ -1459,8 +1481,10 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { } else { if (Tmp1 != ST->getChain() || Tmp3 != ST->getValue() || Tmp2 != ST->getBasePtr()) - Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2, - ST->getOffset()); + Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(), + Tmp1, Tmp3, Tmp2, + ST->getOffset()), + Result.getResNo()); switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { default: assert(0 && "This action is not supported yet!"); @@ -1469,7 +1493,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { // expand it. 
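// A hedged sketch of the UpdateNodeOperands pattern repeated throughout these
// hunks: the helper now appears to take and return an SDNode* rather than an
// SDValue, so callers pass Result.getNode() and rebuild the SDValue themselves,
// keeping the original result number intact:
//
//   // old: Result = DAG.UpdateNodeOperands(Result, Tmp1, Tmp3, Tmp2, ST->getOffset());
//   // new:
//   Result = SDValue(DAG.UpdateNodeOperands(Result.getNode(),
//                                           Tmp1, Tmp3, Tmp2, ST->getOffset()),
//                    Result.getResNo());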
if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); if (ST->getAlignment() < ABIAlignment) Result = ExpandUnalignedStore(cast(Result.getNode()), DAG, TLI); @@ -1531,7 +1555,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, NULL, 0, false, false, 0); else - return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, + return DAG.getExtLoad(ISD::EXTLOAD, Op.getValueType(), dl, Ch, StackPtr, NULL, 0, Vec.getValueType().getVectorElementType(), false, false, 0); } @@ -1568,7 +1592,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { Node->getOperand(i), Idx, SV, Offset, EltVT, false, false, 0)); } else - Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, + Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i), Idx, SV, Offset, false, false, 0)); } @@ -1763,7 +1787,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, DestAlign); assert(SlotSize < DestSize && "Unknown extension!"); - return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, SV, 0, SlotVT, + return DAG.getExtLoad(ISD::EXTLOAD, DestVT, dl, Store, FIPtr, SV, 0, SlotVT, false, false, DestAlign); } @@ -1926,6 +1950,44 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, return CallInfo.first; } +// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to +// ExpandLibCall except that the first operand is the in-chain. +std::pair +SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, + SDNode *Node, + bool isSigned) { + assert(!IsLegalizingCall && "Cannot overlap legalization of calls!"); + SDValue InChain = Node->getOperand(0); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) { + EVT ArgVT = Node->getOperand(i).getValueType(); + const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Node->getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy()); + + // Splice the libcall in wherever FindInputOutputChains tells us to. + const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); + std::pair CallInfo = + TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + 0, TLI.getLibcallCallingConv(LC), false, + /*isReturnValueUsed=*/true, + Callee, Args, DAG, Node->getDebugLoc()); + + // Legalize the call sequence, starting with the chain. This will advance + // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that + // was added by LowerCallTo (guaranteeing proper serialization of calls). 
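// The pair produced by ExpandChainLibCall is (call result, out-chain): operand 0
// of the node is fed to LowerCallTo as the in-chain, and the chain coming out of
// the legalized call sequence is returned alongside the value, so a caller can
// replace both results of a chained node.  A sketch of the intended use (it
// mirrors the atomic expansion later in this file):
//
//   std::pair<SDValue, SDValue> Tmp = ExpandChainLibCall(LC, Node, false);
//   Results.push_back(Tmp.first);   // value computed by the libcall
//   Results.push_back(Tmp.second);  // chain after the CALLSEQ_END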
+ LegalizeOp(CallInfo.second); + return CallInfo; +} + SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, @@ -2048,7 +2110,8 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84); SDValue LoFlt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, LoOr); SDValue HiFlt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt, TwoP84PlusTwoP52); + SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt, + TwoP84PlusTwoP52); return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub); } @@ -2058,11 +2121,11 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) { EVT SHVT = TLI.getShiftAmountTy(); - SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, + SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, DAG.getConstant(UINT64_C(0xfffffffffffff800), MVT::i64)); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, DAG.getConstant(UINT64_C(0x800), MVT::i64)); - SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, + SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, DAG.getConstant(UINT64_C(0x7ff), MVT::i64)); SDValue Ne = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64), And2, DAG.getConstant(UINT64_C(0), MVT::i64), ISD::SETNE); @@ -2122,7 +2185,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, false, false, Alignment); else { FudgeInReg = - LegalizeOp(DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, + LegalizeOp(DAG.getExtLoad(ISD::EXTLOAD, DestVT, dl, DAG.getEntryNode(), CPIdx, PseudoSourceValue::getConstantPool(), 0, MVT::f32, false, false, Alignment)); @@ -2332,6 +2395,92 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, } } +std::pair SelectionDAGLegalize::ExpandAtomic(SDNode *Node) { + unsigned Opc = Node->getOpcode(); + MVT VT = cast(Node)->getMemoryVT().getSimpleVT(); + RTLIB::Libcall LC; + + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic intrinsic Expand!"); + break; + case ISD::ATOMIC_SWAP: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_1; break; + case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break; + case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break; + case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break; + } + break; + case ISD::ATOMIC_CMP_SWAP: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1; break; + case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break; + case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break; + case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break; + } + break; + case ISD::ATOMIC_LOAD_ADD: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_ADD_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break; + } + break; + case ISD::ATOMIC_LOAD_SUB: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_SUB_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break; + 
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break; + } + break; + case ISD::ATOMIC_LOAD_AND: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_AND_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break; + } + break; + case ISD::ATOMIC_LOAD_OR: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_OR_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break; + } + break; + case ISD::ATOMIC_LOAD_XOR: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_XOR_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break; + } + break; + case ISD::ATOMIC_LOAD_NAND: + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type for atomic!"); + case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_NAND_1; break; + case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break; + case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break; + case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break; + } + break; + } + + return ExpandChainLibCall(LC, Node, false); +} + void SelectionDAGLegalize::ExpandNode(SDNode *Node, SmallVectorImpl &Results) { DebugLoc dl = Node->getDebugLoc(); @@ -2357,10 +2506,48 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, case ISD::EH_RETURN: case ISD::EH_LABEL: case ISD::PREFETCH: - case ISD::MEMBARRIER: case ISD::VAEND: + case ISD::EH_SJLJ_LONGJMP: + Results.push_back(Node->getOperand(0)); + break; + case ISD::EH_SJLJ_SETJMP: + Results.push_back(DAG.getConstant(0, MVT::i32)); Results.push_back(Node->getOperand(0)); break; + case ISD::MEMBARRIER: { + // If the target didn't lower this, lower it to '__sync_synchronize()' call + TargetLowering::ArgListTy Args; + std::pair CallResult = + TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()), + false, false, false, false, 0, CallingConv::C, false, + /*isReturnValueUsed=*/true, + DAG.getExternalSymbol("__sync_synchronize", + TLI.getPointerTy()), + Args, DAG, dl); + Results.push_back(CallResult.second); + break; + } + // By default, atomic intrinsics are marked Legal and lowered. Targets + // which don't support them directly, however, may want libcalls, in which + // case they mark them Expand, and we get here. + // FIXME: Unimplemented for now. Add libcalls. 
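// For targets that mark these atomics Expand, ExpandAtomic above picks the
// __sync_* libcall whose _1/_2/_4/_8 suffix matches the width of the atomic's
// memory type and emits it through ExpandChainLibCall.  Assuming the usual RTLIB
// symbol names, an i32 ISD::ATOMIC_LOAD_ADD ends up roughly as:
//
//   // i32 old = __sync_fetch_and_add_4(ptr, val);
//   std::pair<SDValue, SDValue> CallInfo =
//       ExpandChainLibCall(RTLIB::SYNC_FETCH_AND_ADD_4, Node, /*isSigned=*/false);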
+ case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_CMP_SWAP: { + std::pair Tmp = ExpandAtomic(Node); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + break; + } case ISD::DYNAMIC_STACKALLOC: ExpandDYNAMIC_STACKALLOC(Node, Results); break; @@ -2465,15 +2652,31 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, EVT VT = Node->getValueType(0); Tmp1 = Node->getOperand(0); Tmp2 = Node->getOperand(1); - SDValue VAList = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2, V, 0, - false, false, 0); + unsigned Align = Node->getConstantOperandVal(3); + + SDValue VAListLoad = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2, V, 0, + false, false, 0); + SDValue VAList = VAListLoad; + + if (Align > TLI.getMinStackArgumentAlignment()) { + assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2"); + + VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList, + DAG.getConstant(Align - 1, + TLI.getPointerTy())); + + VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList, + DAG.getConstant(-Align, + TLI.getPointerTy())); + } + // Increment the pointer, VAList, to the next vaarg Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList, DAG.getConstant(TLI.getTargetData()-> getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())), TLI.getPointerTy())); // Store the incremented VAList to the legalized pointer - Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Tmp2, V, 0, + Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2, V, 0, false, false, 0); // Load the actual argument out of the pointer VAList Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, NULL, 0, @@ -2496,7 +2699,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, case ISD::EXTRACT_VECTOR_ELT: if (Node->getOperand(0).getValueType().getVectorNumElements() == 1) // This must be an access of the only element. Return it. 
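// A stand-alone illustration, independent of the SelectionDAG code, of the
// rounding the VAARG expansion above applies to the va_list pointer: for a
// power-of-two alignment, (p + Align - 1) & -Align rounds p up to the next
// Align-byte boundary and leaves already-aligned pointers unchanged.

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t RoundUpToAlign(uint64_t Ptr, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "Expected Align to be a power of 2");
  return (Ptr + Align - 1) & -Align;  // -Align == ~(Align - 1) for powers of two
}

int main() {
  std::printf("%#llx\n", (unsigned long long)RoundUpToAlign(0x1003, 16)); // 0x1010
  std::printf("%#llx\n", (unsigned long long)RoundUpToAlign(0x1010, 16)); // 0x1010
  return 0;
}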
- Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, Node->getValueType(0), + Tmp1 = DAG.getNode(ISD::BIT_CONVERT, dl, Node->getValueType(0), Node->getOperand(0)); else Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0)); @@ -2948,13 +3151,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, const TargetData &TD = *TLI.getTargetData(); unsigned EntrySize = DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD); - + Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(EntrySize, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr, + SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, PTy, dl, Chain, Addr, PseudoSourceValue::getJumpTable(), 0, MemVT, false, false, 0); Addr = LD; diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index e3eb949567a3..650ee5a0721c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -453,8 +453,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { SDValue NewL; if (L->getExtensionType() == ISD::NON_EXTLOAD) { - NewL = DAG.getLoad(L->getAddressingMode(), dl, L->getExtensionType(), - NVT, L->getChain(), L->getBasePtr(), L->getOffset(), + NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), + NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getSrcValue(), L->getSrcValueOffset(), NVT, L->isVolatile(), L->isNonTemporal(), L->getAlignment()); // Legalized the chain result - switch anything that used the old chain to @@ -464,8 +464,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { } // Do a non-extending load followed by FP_EXTEND. - NewL = DAG.getLoad(L->getAddressingMode(), dl, ISD::NON_EXTLOAD, - L->getMemoryVT(), L->getChain(), + NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, + L->getMemoryVT(), dl, L->getChain(), L->getBasePtr(), L->getOffset(), L->getSrcValue(), L->getSrcValueOffset(), L->getMemoryVT(), L->isVolatile(), @@ -504,7 +504,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) { DebugLoc dl = N->getDebugLoc(); SDValue NewVAARG; - NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2)); + NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), + N->getConstantOperandVal(3)); // Legalized the chain result - switch anything that used the old chain to // use the new one. @@ -698,9 +699,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, - N->getOperand(4)); + N->getOperand(4)), + 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) { @@ -739,9 +741,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), - DAG.getCondCode(CCCode)); + DAG.getCondCode(CCCode)), + 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { @@ -757,8 +760,9 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. 
- return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, - DAG.getCondCode(CCCode)); + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, + DAG.getCondCode(CCCode)), + 0); } SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) { @@ -1106,7 +1110,7 @@ void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo, assert(NVT.isByteSized() && "Expanded type not byte sized!"); assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?"); - Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr, + Hi = DAG.getExtLoad(LD->getExtensionType(), NVT, dl, Chain, Ptr, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); @@ -1294,9 +1298,9 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, - N->getOperand(4)); + N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) { @@ -1375,9 +1379,9 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), - DAG.getCondCode(CCCode)); + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) { @@ -1393,8 +1397,8 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, - DAG.getCondCode(CCCode)); + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8b382bc7670d..b94ea9a3a9af 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -369,7 +369,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType(); DebugLoc dl = N->getDebugLoc(); - SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), + SDValue Res = DAG.getExtLoad(ExtType, NVT, dl, N->getChain(), N->getBasePtr(), N->getSrcValue(), N->getSrcValueOffset(), N->getMemoryVT(), N->isVolatile(), N->isNonTemporal(), N->getAlignment()); @@ -572,7 +572,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { SmallVector Parts(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { - Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2)); + Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2), + N->getConstantOperandVal(3)); Chain = Parts[i].getValue(1); } @@ -725,8 +726,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) { // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always // legal types. 
- return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), - N->getOperand(1), LHS, RHS, N->getOperand(4)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), + N->getOperand(1), LHS, RHS, N->getOperand(4)), + 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) { @@ -737,8 +739,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) { SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT); // The chain (Op#0) and basic block destination (Op#2) are always legal types. - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Cond, - N->getOperand(2)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond, + N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) { @@ -773,7 +775,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { for (unsigned i = 0; i < NumElts; ++i) NewOps.push_back(GetPromotedInteger(N->getOperand(i))); - return DAG.UpdateNodeOperands(SDValue(N, 0), &NewOps[0], NumElts); + return SDValue(DAG.UpdateNodeOperands(N, &NewOps[0], NumElts), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { @@ -798,17 +800,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, assert(N->getOperand(1).getValueType().getSizeInBits() >= N->getValueType(0).getVectorElementType().getSizeInBits() && "Type of inserted value narrower than vector element type!"); - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), GetPromotedInteger(N->getOperand(1)), - N->getOperand(2)); + N->getOperand(2)), + 0); } assert(OpNo == 2 && "Different operand and result vector types?"); // Promote the index. SDValue Idx = ZExtPromotedInteger(N->getOperand(2)); - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), - N->getOperand(1), Idx); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), + N->getOperand(1), Idx), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) { @@ -819,15 +822,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) { SDValue Flag = GetPromotedInteger(N->getOperand(i)); NewOps[i] = DAG.getZeroExtendInReg(Flag, dl, MVT::i1); } - return DAG.UpdateNodeOperands(SDValue (N, 0), NewOps, - array_lengthof(NewOps)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps, array_lengthof(NewOps)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote // the operand in place. - return DAG.UpdateNodeOperands(SDValue(N, 0), - GetPromotedInteger(N->getOperand(0))); + return SDValue(DAG.UpdateNodeOperands(N, + GetPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { @@ -837,8 +839,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { EVT SVT = TLI.getSetCCResultType(N->getOperand(1).getValueType()); SDValue Cond = PromoteTargetBoolean(N->getOperand(0), SVT); - return DAG.UpdateNodeOperands(SDValue(N, 0), Cond, - N->getOperand(1), N->getOperand(2)); + return SDValue(DAG.UpdateNodeOperands(N, Cond, + N->getOperand(1), N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { @@ -849,8 +851,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(4))->get()); // The CC (#4) and the possible return values (#2 and #3) have legal types. 
- return DAG.UpdateNodeOperands(SDValue(N, 0), LHS, RHS, N->getOperand(2), - N->getOperand(3), N->getOperand(4)); + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2), + N->getOperand(3), N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { @@ -861,12 +863,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(2))->get()); // The CC (#2) is always legal. - return DAG.UpdateNodeOperands(SDValue(N, 0), LHS, RHS, N->getOperand(2)); + return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) { - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), - ZExtPromotedInteger(N->getOperand(1))); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), + ZExtPromotedInteger(N->getOperand(1))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { @@ -878,8 +880,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) { - return DAG.UpdateNodeOperands(SDValue(N, 0), - SExtPromotedInteger(N->getOperand(0))); + return SDValue(DAG.UpdateNodeOperands(N, + SExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ @@ -905,8 +907,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) { - return DAG.UpdateNodeOperands(SDValue(N, 0), - ZExtPromotedInteger(N->getOperand(0))); + return SDValue(DAG.UpdateNodeOperands(N, + ZExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { @@ -990,6 +992,11 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SHL: case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; + + case ISD::SADDO: + case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break; + case ISD::UADDO: + case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -1526,7 +1533,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, if (N->getMemoryVT().bitsLE(NVT)) { EVT MemVT = N->getMemoryVT(); - Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), SVOffset, + Lo = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset, MemVT, isVolatile, isNonTemporal, Alignment); // Remember the chain. @@ -1559,7 +1566,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), + Hi = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset+IncrementSize, NEVT, isVolatile, isNonTemporal, MinAlign(Alignment, IncrementSize)); @@ -1577,7 +1584,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, unsigned ExcessBits = (EBytes - IncrementSize)*8; // Load both the high bits and maybe some of the low bits. 
- Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getSrcValue(), SVOffset, + Hi = DAG.getExtLoad(ExtType, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset, EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), isVolatile, isNonTemporal, Alignment); @@ -1586,7 +1593,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(IncrementSize)); // Load the rest of the low bits. - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getSrcValue(), + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, NVT, dl, Ch, Ptr, N->getSrcValue(), SVOffset+IncrementSize, EVT::getIntegerVT(*DAG.getContext(), ExcessBits), isVolatile, isNonTemporal, @@ -1716,6 +1723,48 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, SplitInteger(MakeLibCall(LC, VT, Ops, 2, true/*irrelevant*/, dl), Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, + SDValue &Lo, SDValue &Hi) { + SDValue LHS = Node->getOperand(0); + SDValue RHS = Node->getOperand(1); + DebugLoc dl = Node->getDebugLoc(); + + // Expand the result by simply replacing it with the equivalent + // non-overflow-checking operation. + SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ? + ISD::ADD : ISD::SUB, dl, LHS.getValueType(), + LHS, RHS); + SplitInteger(Sum, Lo, Hi); + + // Compute the overflow. + // + // LHSSign -> LHS >= 0 + // RHSSign -> RHS >= 0 + // SumSign -> Sum >= 0 + // + // Add: + // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) + // Sub: + // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) + // + EVT OType = Node->getValueType(1); + SDValue Zero = DAG.getConstant(0, LHS.getValueType()); + + SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); + SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); + SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, + Node->getOpcode() == ISD::SADDO ? + ISD::SETEQ : ISD::SETNE); + + SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE); + SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); + + SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); + + // Use the calculated overflow everywhere. + ReplaceValueWith(SDValue(Node, 1), Cmp); +} + void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -1912,6 +1961,29 @@ void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } +void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, + SDValue &Lo, SDValue &Hi) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + DebugLoc dl = N->getDebugLoc(); + + // Expand the result by simply replacing it with the equivalent + // non-overflow-checking operation. + SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ? + ISD::ADD : ISD::SUB, dl, LHS.getValueType(), + LHS, RHS); + SplitInteger(Sum, Lo, Hi); + + // Calculate the overflow: addition overflows iff a + b < a, and subtraction + // overflows iff a - b > a. + SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, + N->getOpcode () == ISD::UADDO ? + ISD::SETULT : ISD::SETUGT); + + // Use the calculated overflow everywhere. + ReplaceValueWith(SDValue(N, 1), Ofl); +} + void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); @@ -2154,9 +2226,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) { } // Update N to have the operands specified. 
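// A stand-alone check, on 8-bit values with two's-complement wrap assumed, of the
// overflow rules used by the SADDO/SSUBO and UADDO/USUBO expansions above:
// unsigned addition overflows iff (a + b) < a, and signed addition overflows iff
// the operands have the same sign but the sum's sign differs.

#include <cstdint>
#include <cstdio>

static bool UAddOverflows(uint8_t A, uint8_t B) {
  return (uint8_t)(A + B) < A;                  // wrapped sum smaller than an operand
}

static bool SAddOverflows(int8_t A, int8_t B) {
  int8_t Sum = (int8_t)(A + B);                 // wrapped signed sum
  bool LHSSign = A >= 0, RHSSign = B >= 0, SumSign = Sum >= 0;
  return LHSSign == RHSSign && LHSSign != SumSign;
}

int main() {
  std::printf("%d %d\n", UAddOverflows(200, 100), UAddOverflows(10, 20));  // 1 0
  std::printf("%d %d\n", SAddOverflows(100, 100), SAddOverflows(-50, 20)); // 1 0
  return 0;
}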
- return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, - N->getOperand(4)); + N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { @@ -2172,9 +2244,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { } // Update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), - DAG.getCondCode(CCCode)); + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { @@ -2190,8 +2262,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { } // Otherwise, update N to have the operands specified. - return DAG.UpdateNodeOperands(SDValue(N, 0), NewLHS, NewRHS, - DAG.getCondCode(CCCode)); + return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, + DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { @@ -2200,7 +2272,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // upper half of the shift amount is zero. Just use the lower half. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(1), Lo, Hi); - return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Lo); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { @@ -2209,7 +2281,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { // constant to valid type. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(0), Lo, Hi); - return DAG.UpdateNodeOperands(SDValue(N, 0), Lo); + return SDValue(DAG.UpdateNodeOperands(N, Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { @@ -2384,7 +2456,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { // Load the value out, extending it from f32 to the destination float type. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), + SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, DstVT, dl, DAG.getEntryNode(), FudgePtr, NULL, 0, MVT::f32, false, false, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 17f131b21e4a..6e56c98e9b56 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -485,15 +485,14 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) { NewOps.push_back(Op); } else if (Op != OrigOp) { // This is the first operand to change - add all operands so far. - NewOps.insert(NewOps.end(), N->op_begin(), N->op_begin() + i); + NewOps.append(N->op_begin(), N->op_begin() + i); NewOps.push_back(Op); } } // Some operands changed - update the node. if (!NewOps.empty()) { - SDNode *M = DAG.UpdateNodeOperands(SDValue(N, 0), &NewOps[0], - NewOps.size()).getNode(); + SDNode *M = DAG.UpdateNodeOperands(N, &NewOps[0], NewOps.size()); if (M != N) { // The node morphed into a different node. Normally for this to happen // the original node would have to be marked NewNode. However this can @@ -684,40 +683,45 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) { // can potentially cause recursive merging. 
SmallSetVector NodesToAnalyze; NodeUpdateListener NUL(*this, NodesToAnalyze); - DAG.ReplaceAllUsesOfValueWith(From, To, &NUL); - - // The old node may still be present in a map like ExpandedIntegers or - // PromotedIntegers. Inform maps about the replacement. - ReplacedValues[From] = To; - - // Process the list of nodes that need to be reanalyzed. - while (!NodesToAnalyze.empty()) { - SDNode *N = NodesToAnalyze.back(); - NodesToAnalyze.pop_back(); - if (N->getNodeId() != DAGTypeLegalizer::NewNode) - // The node was analyzed while reanalyzing an earlier node - it is safe to - // skip. Note that this is not a morphing node - otherwise it would still - // be marked NewNode. - continue; + do { + DAG.ReplaceAllUsesOfValueWith(From, To, &NUL); + + // The old node may still be present in a map like ExpandedIntegers or + // PromotedIntegers. Inform maps about the replacement. + ReplacedValues[From] = To; + + // Process the list of nodes that need to be reanalyzed. + while (!NodesToAnalyze.empty()) { + SDNode *N = NodesToAnalyze.back(); + NodesToAnalyze.pop_back(); + if (N->getNodeId() != DAGTypeLegalizer::NewNode) + // The node was analyzed while reanalyzing an earlier node - it is safe + // to skip. Note that this is not a morphing node - otherwise it would + // still be marked NewNode. + continue; - // Analyze the node's operands and recalculate the node ID. - SDNode *M = AnalyzeNewNode(N); - if (M != N) { - // The node morphed into a different node. Make everyone use the new node - // instead. - assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!"); - assert(N->getNumValues() == M->getNumValues() && - "Node morphing changed the number of results!"); - for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { - SDValue OldVal(N, i); - SDValue NewVal(M, i); - if (M->getNodeId() == Processed) - RemapValue(NewVal); - DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL); + // Analyze the node's operands and recalculate the node ID. + SDNode *M = AnalyzeNewNode(N); + if (M != N) { + // The node morphed into a different node. Make everyone use the new + // node instead. + assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!"); + assert(N->getNumValues() == M->getNumValues() && + "Node morphing changed the number of results!"); + for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + SDValue OldVal(N, i); + SDValue NewVal(M, i); + if (M->getNodeId() == Processed) + RemapValue(NewVal); + DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal, &NUL); + } + // The original node continues to exist in the DAG, marked NewNode. } - // The original node continues to exist in the DAG, marked NewNode. } - } + // When recursively update nodes with new nodes, it is possible to have + // new uses of From due to CSE. If this happens, replace the new uses of + // From with To. 
+ } while (!From.use_empty()); } void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index c6659630ede2..bd86694446d6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -345,6 +345,9 @@ private: void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandShiftByConstant(SDNode *N, unsigned Amt, SDValue &Lo, SDValue &Hi); bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -620,6 +623,7 @@ private: SDValue WidenVecRes_Binary(SDNode *N); SDValue WidenVecRes_Convert(SDNode *N); + SDValue WidenVecRes_POWI(SDNode *N); SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 88e1e624ae32..9c2b1d9ed73d 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -238,13 +238,15 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo, } void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT OVT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Chain = N->getOperand(0); SDValue Ptr = N->getOperand(1); DebugLoc dl = N->getDebugLoc(); + const unsigned Align = N->getConstantOperandVal(3); - Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2)); - Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2)); + Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align); + Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0); // Handle endianness of the load. 
if (TLI.isBigEndian()) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 0e2bd0233712..621c08724210 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -116,7 +116,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Ops.push_back(LegalizeOp(Node->getOperand(i))); SDValue Result = - DAG.UpdateNodeOperands(Op.getValue(0), Ops.data(), Ops.size()); + SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops.data(), Ops.size()), 0); bool HasVectorValue = false; for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7efeea1ddaf9..93aeff5c1e6c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -165,9 +165,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) { assert(N->isUnindexed() && "Indexed vector load?"); - SDValue Result = DAG.getLoad(ISD::UNINDEXED, N->getDebugLoc(), + SDValue Result = DAG.getLoad(ISD::UNINDEXED, N->getExtensionType(), N->getValueType(0).getVectorElementType(), + N->getDebugLoc(), N->getChain(), N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()), N->getSrcValue(), N->getSrcValueOffset(), @@ -448,6 +449,11 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -755,14 +761,14 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, EVT LoMemVT, HiMemVT; GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT); - Lo = DAG.getLoad(ISD::UNINDEXED, dl, ExtType, LoVT, Ch, Ptr, Offset, + Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset, SV, SVOffset, LoMemVT, isVolatile, isNonTemporal, Alignment); unsigned IncrementSize = LoMemVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(IncrementSize)); SVOffset += IncrementSize; - Hi = DAG.getLoad(ISD::UNINDEXED, dl, ExtType, HiVT, Ch, Ptr, Offset, + Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, SV, SVOffset, HiMemVT, isVolatile, isNonTemporal, Alignment); // Build a factor node to remember that this load is independent of the @@ -1082,10 +1088,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { uint64_t LoElts = Lo.getValueType().getVectorNumElements(); if (IdxVal < LoElts) - return DAG.UpdateNodeOperands(SDValue(N, 0), Lo, Idx); - return DAG.UpdateNodeOperands(SDValue(N, 0), Hi, + return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0); + return SDValue(DAG.UpdateNodeOperands(N, Hi, DAG.getConstant(IdxVal - LoElts, - Idx.getValueType())); + Idx.getValueType())), + 0); } // Store the vector to the stack. @@ -1099,7 +1106,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { // Load back the required element. 
StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); - return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, + return DAG.getExtLoad(ISD::EXTLOAD, N->getValueType(0), dl, Store, StackPtr, SV, 0, EltVT, false, false, 0); } @@ -1199,7 +1206,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FDIV: case ISD::FMUL: case ISD::FPOW: - case ISD::FPOWI: case ISD::FREM: case ISD::FSUB: case ISD::MUL: @@ -1215,6 +1221,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_Binary(N); break; + case ISD::FPOWI: + Res = WidenVecRes_POWI(N); + break; + case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -1241,6 +1251,11 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FNEG: case ISD::FSIN: case ISD::FSQRT: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: Res = WidenVecRes_Unary(N); break; } @@ -1258,7 +1273,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { EVT WidenEltVT = WidenVT.getVectorElementType(); EVT VT = WidenVT; unsigned NumElts = VT.getVectorNumElements(); - while (!TLI.isTypeLegal(VT) && NumElts != 1) { + while (!TLI.isTypeSynthesizable(VT) && NumElts != 1) { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } @@ -1273,13 +1288,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()); } else { // Since the operation can trap, apply operation on the original vector. + EVT MaxVT = VT; SDValue InOp1 = GetWidenedVector(N->getOperand(0)); SDValue InOp2 = GetWidenedVector(N->getOperand(1)); unsigned CurNumElts = N->getValueType(0).getVectorNumElements(); SmallVector ConcatOps(CurNumElts); unsigned ConcatEnd = 0; // Current ConcatOps index. - unsigned Idx = 0; // Current Idx into input vectors. + int Idx = 0; // Current Idx into input vectors. + + // NumElts := greatest synthesizable vector size (at most WidenVT) + // while (orig. vector has unhandled elements) { + // take munches of size NumElts from the beginning and add to ConcatOps + // NumElts := next smaller supported vector size or 1 + // } while (CurNumElts != 0) { while (CurNumElts >= NumElts) { SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1, @@ -1290,26 +1312,21 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { Idx += NumElts; CurNumElts -= NumElts; } - EVT PrevVecVT = VT; do { NumElts = NumElts / 2; VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); - } while (!TLI.isTypeLegal(VT) && NumElts != 1); + } while (!TLI.isTypeSynthesizable(VT) && NumElts != 1); if (NumElts == 1) { - // Since we are using concat vector, build a vector from the scalar ops. 
- SDValue VecOp = DAG.getUNDEF(PrevVecVT); for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) { SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1, DAG.getIntPtrConstant(Idx)); SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2, DAG.getIntPtrConstant(Idx)); - VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, PrevVecVT, VecOp, - DAG.getNode(Opcode, dl, WidenEltVT, EOp1, EOp2), - DAG.getIntPtrConstant(i)); + ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT, + EOp1, EOp2); } CurNumElts = 0; - ConcatOps[ConcatEnd++] = VecOp; } } @@ -1320,23 +1337,65 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) { return ConcatOps[0]; } - // Rebuild vector to one with the widen type - Idx = ConcatEnd - 1; - while (Idx != 0) { + // while (Some element of ConcatOps is not of type MaxVT) { + // From the end of ConcatOps, collect elements of the same type and put + // them into an op of the next larger supported type + // } + while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) { + Idx = ConcatEnd - 1; VT = ConcatOps[Idx--].getValueType(); - while (Idx != 0 && ConcatOps[Idx].getValueType() == VT) - --Idx; - if (Idx != 0) { - VT = ConcatOps[Idx].getValueType(); - ConcatOps[Idx+1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - &ConcatOps[Idx+1], ConcatEnd - Idx - 1); + while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT) + Idx--; + + int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1; + EVT NextVT; + do { + NextSize *= 2; + NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize); + } while (!TLI.isTypeSynthesizable(NextVT)); + + if (!VT.isVector()) { + // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT + SDValue VecOp = DAG.getUNDEF(NextVT); + unsigned NumToInsert = ConcatEnd - Idx - 1; + for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) { + VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, + ConcatOps[OpIdx], DAG.getIntPtrConstant(i)); + } + ConcatOps[Idx+1] = VecOp; ConcatEnd = Idx + 2; + } + else { + // Vector type, create a CONCAT_VECTORS of type NextVT + SDValue undefVec = DAG.getUNDEF(VT); + unsigned OpsToConcat = NextSize/VT.getVectorNumElements(); + SmallVector SubConcatOps(OpsToConcat); + unsigned RealVals = ConcatEnd - Idx - 1; + unsigned SubConcatEnd = 0; + unsigned SubConcatIdx = Idx + 1; + while (SubConcatEnd < RealVals) + SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx]; + while (SubConcatEnd < OpsToConcat) + SubConcatOps[SubConcatEnd++] = undefVec; + ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, + NextVT, &SubConcatOps[0], + OpsToConcat); + ConcatEnd = SubConcatIdx + 1; } } + + // Check to see if we have a single operation with the widen type. 
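// A stand-alone sketch of the chunking strategy the comments above describe for
// trapping vector ops: peel off as many full-width "munches" as fit, then drop
// to the next smaller width, down to scalars.  For brevity this assumes every
// power-of-two width up to MaxWidth is synthesizable, whereas the real code asks
// the target via isTypeSynthesizable at each step.

#include <cstdio>
#include <vector>

static std::vector<unsigned> ChunkElements(unsigned NumElts, unsigned MaxWidth) {
  std::vector<unsigned> Chunks;
  unsigned Width = MaxWidth;
  while (NumElts != 0) {
    while (Width > 1 && Width > NumElts)
      Width /= 2;                 // next smaller supported size, or 1 (scalar)
    Chunks.push_back(Width);      // take one munch of this size
    NumElts -= Width;
  }
  return Chunks;
}

int main() {
  // e.g. 7 elements with a widest legal vector of 4 lanes -> munches of 4, 2, 1.
  for (unsigned C : ChunkElements(7, 4))
    std::printf("%u ", C);
  std::printf("\n");
  return 0;
}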
+ if (ConcatEnd == 1) { + VT = ConcatOps[0].getValueType(); + if (VT == WidenVT) + return ConcatOps[0]; + } - unsigned NumOps = WidenVT.getVectorNumElements()/VT.getVectorNumElements(); + // add undefs of size MaxVT until ConcatOps grows to length of WidenVT + unsigned NumOps = + WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements(); if (NumOps != ConcatEnd ) { - SDValue UndefVal = DAG.getUNDEF(VT); + SDValue UndefVal = DAG.getUNDEF(MaxVT); for (unsigned j = ConcatEnd; j < NumOps; ++j) ConcatOps[j] = UndefVal; } @@ -1366,7 +1425,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(Opcode, dl, WidenVT, InOp); } - if (TLI.isTypeLegal(InWidenVT)) { + if (TLI.isTypeSynthesizable(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input @@ -1410,6 +1469,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts); } +SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue InOp = GetWidenedVector(N->getOperand(0)); + SDValue ShOp = N->getOperand(1); + return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp); +} + SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); @@ -1501,7 +1567,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BIT_CONVERT(SDNode *N) { NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts); } - if (TLI.isTypeLegal(NewInVT)) { + if (TLI.isTypeSynthesizable(NewInVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input @@ -1642,7 +1708,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SatOp, CvtCode); } - if (TLI.isTypeLegal(InWidenVT)) { + if (TLI.isTypeSynthesizable(InWidenVT)) { // Because the result and the input are different vector types, widening // the result could create a legal type but widening the input might make // it an illegal type that might lead to repeatedly splitting the input @@ -1968,7 +2034,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_BIT_CONVERT(SDNode *N) { if (InWidenSize % Size == 0 && !VT.isVector()) { unsigned NewNumElts = InWidenSize / Size; EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts); - if (TLI.isTypeLegal(NewVT)) { + if (TLI.isTypeSynthesizable(NewVT)) { SDValue BitOp = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, InOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp, DAG.getIntPtrConstant(0)); @@ -2066,7 +2132,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, unsigned MemVTWidth = MemVT.getSizeInBits(); if (MemVT.getSizeInBits() <= WidenEltWidth) break; - if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 && + if (TLI.isTypeSynthesizable(MemVT) && (WidenWidth % MemVTWidth) == 0 && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { RetVT = MemVT; @@ -2080,7 +2146,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI, VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) { EVT MemVT = (MVT::SimpleValueType) VT; unsigned 
MemVTWidth = MemVT.getSizeInBits(); - if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && + if (TLI.isTypeSynthesizable(MemVT) && WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { @@ -2286,14 +2352,14 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector& LdChain, unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector Ops(WidenNumElts); unsigned Increment = LdEltVT.getSizeInBits() / 8; - Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, SV, SVOffset, + Ops[0] = DAG.getExtLoad(ExtType, EltVT, dl, Chain, BasePtr, SV, SVOffset, LdEltVT, isVolatile, isNonTemporal, Align); LdChain.push_back(Ops[0].getValue(1)); unsigned i = 0, Offset = Increment; for (i=1; i < NumElts; ++i, Offset += Increment) { SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(Offset)); - Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr, SV, + Ops[i] = DAG.getExtLoad(ExtType, EltVT, dl, Chain, NewBasePtr, SV, SVOffset + Offset, LdEltVT, isVolatile, isNonTemporal, Align); LdChain.push_back(Ops[i].getValue(1)); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index ad8630afff45..3b86c3286585 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -535,7 +535,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() { SUnit *LRDef = LiveRegDefs[Reg]; EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = - TRI->getPhysicalRegisterRegClass(Reg, VT); + TRI->getMinimalPhysRegClass(Reg, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); // If cross copy register class is null, then it must be possible copy diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 820ba6681606..3ef521c398e1 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -320,7 +320,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { CapturePred(&*I); - if (I->isAssignedRegDep() && SU->getHeight() == LiveRegCycles[I->getReg()]) { + if (I->isAssignedRegDep() && SU->getHeight() == LiveRegCycles[I->getReg()]){ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); assert(LiveRegDefs[I->getReg()] == I->getSUnit() && "Physical register dependency violated?"); @@ -795,7 +795,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { SUnit *LRDef = LiveRegDefs[Reg]; EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII); const TargetRegisterClass *RC = - TRI->getPhysicalRegisterRegClass(Reg, VT); + TRI->getMinimalPhysRegClass(Reg, VT); const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC); // If cross copy register class is null, then it must be possible copy @@ -1116,7 +1116,7 @@ namespace { SUnit *pop() { if (empty()) return NULL; std::vector::iterator Best = Queue.begin(); - for (std::vector::iterator I = next(Queue.begin()), + for (std::vector::iterator I = llvm::next(Queue.begin()), E = Queue.end(); I != E; ++I) if (Picker(*Best, *I)) Best = I; @@ -1275,6 +1275,17 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{ return left->getHeight() > right->getHeight(); } else if (RStall) return false; + + // If either 
node is scheduling for latency, sort them by height and latency + // first. + if (left->SchedulingPref == Sched::Latency || + right->SchedulingPref == Sched::Latency) { + if (left->getHeight() != right->getHeight()) + return left->getHeight() > right->getHeight(); + if (left->Latency != right->Latency) + return left->Latency > right->Latency; + } + return BURRSort(left, right, SPQ); } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 3185c88b82bf..06cf05308755 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -59,7 +59,11 @@ SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) { SUnits.back().OrigNode = &SUnits.back(); SUnit *SU = &SUnits.back(); const TargetLowering &TLI = DAG->getTargetLoweringInfo(); - SU->SchedulingPref = TLI.getSchedulingPreference(N); + if (N->isMachineOpcode() && + N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) + SU->SchedulingPref = Sched::None; + else + SU->SchedulingPref = TLI.getSchedulingPreference(N); return SU; } @@ -97,7 +101,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) { PhysReg = Reg; const TargetRegisterClass *RC = - TRI->getPhysicalRegisterRegClass(Reg, Def->getValueType(ResNo)); + TRI->getMinimalPhysRegClass(Reg, Def->getValueType(ResNo)); Cost = RC->getCopyCost(); } } @@ -106,17 +110,42 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, static void AddFlags(SDNode *N, SDValue Flag, bool AddFlag, SelectionDAG *DAG) { SmallVector VTs; - for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) - VTs.push_back(N->getValueType(i)); + SDNode *FlagDestNode = Flag.getNode(); + + // Don't add a flag from a node to itself. + if (FlagDestNode == N) return; + + // Don't add a flag to something which already has a flag. + if (N->getValueType(N->getNumValues() - 1) == MVT::Flag) return; + + for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) + VTs.push_back(N->getValueType(I)); + if (AddFlag) VTs.push_back(MVT::Flag); + SmallVector Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - Ops.push_back(N->getOperand(i)); - if (Flag.getNode()) + for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) + Ops.push_back(N->getOperand(I)); + + if (FlagDestNode) Ops.push_back(Flag); + SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size()); + MachineSDNode::mmo_iterator Begin = 0, End = 0; + MachineSDNode *MN = dyn_cast(N); + + // Store memory references. + if (MN) { + Begin = MN->memoperands_begin(); + End = MN->memoperands_end(); + } + DAG->MorphNodeTo(N, N->getOpcode(), VTList, &Ops[0], Ops.size()); + + // Reset the memory references + if (MN) + MN->setMemRefs(Begin, End); } /// ClusterNeighboringLoads - Force nearby loads together by "flagging" them. @@ -124,98 +153,98 @@ static void AddFlags(SDNode *N, SDValue Flag, bool AddFlag, /// offsets are not far apart (target specific), it add MVT::Flag inputs and /// outputs to ensure they are scheduled together and in order. This /// optimization may benefit some targets by improving cache locality. -void ScheduleDAGSDNodes::ClusterNeighboringLoads() { +void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { + SDNode *Chain = 0; + unsigned NumOps = Node->getNumOperands(); + if (Node->getOperand(NumOps-1).getValueType() == MVT::Other) + Chain = Node->getOperand(NumOps-1).getNode(); + if (!Chain) + return; + + // Look for other loads of the same chain. 
Find loads that are loading from + // the same base pointer and different offsets. SmallPtrSet Visited; SmallVector Offsets; DenseMap O2SMap; // Map from offset to SDNode. - for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(), - E = DAG->allnodes_end(); NI != E; ++NI) { - SDNode *Node = &*NI; - if (!Node || !Node->isMachineOpcode()) + bool Cluster = false; + SDNode *Base = Node; + for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end(); + I != E; ++I) { + SDNode *User = *I; + if (User == Node || !Visited.insert(User)) continue; - - unsigned Opc = Node->getMachineOpcode(); - const TargetInstrDesc &TID = TII->get(Opc); - if (!TID.mayLoad()) + int64_t Offset1, Offset2; + if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) || + Offset1 == Offset2) + // FIXME: Should be ok if they addresses are identical. But earlier + // optimizations really should have eliminated one of the loads. continue; + if (O2SMap.insert(std::make_pair(Offset1, Base)).second) + Offsets.push_back(Offset1); + O2SMap.insert(std::make_pair(Offset2, User)); + Offsets.push_back(Offset2); + if (Offset2 < Offset1) + Base = User; + Cluster = true; + } - SDNode *Chain = 0; - unsigned NumOps = Node->getNumOperands(); - if (Node->getOperand(NumOps-1).getValueType() == MVT::Other) - Chain = Node->getOperand(NumOps-1).getNode(); - if (!Chain) - continue; + if (!Cluster) + return; - // Look for other loads of the same chain. Find loads that are loading from - // the same base pointer and different offsets. - Visited.clear(); - Offsets.clear(); - O2SMap.clear(); - bool Cluster = false; - SDNode *Base = Node; - int64_t BaseOffset; - for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end(); - I != E; ++I) { - SDNode *User = *I; - if (User == Node || !Visited.insert(User)) - continue; - int64_t Offset1, Offset2; - if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) || - Offset1 == Offset2) - // FIXME: Should be ok if they addresses are identical. But earlier - // optimizations really should have eliminated one of the loads. - continue; - if (O2SMap.insert(std::make_pair(Offset1, Base)).second) - Offsets.push_back(Offset1); - O2SMap.insert(std::make_pair(Offset2, User)); - Offsets.push_back(Offset2); - if (Offset2 < Offset1) { - Base = User; - BaseOffset = Offset2; - } else { - BaseOffset = Offset1; - } - Cluster = true; - } + // Sort them in increasing order. + std::sort(Offsets.begin(), Offsets.end()); + + // Check if the loads are close enough. + SmallVector Loads; + unsigned NumLoads = 0; + int64_t BaseOff = Offsets[0]; + SDNode *BaseLoad = O2SMap[BaseOff]; + Loads.push_back(BaseLoad); + for (unsigned i = 1, e = Offsets.size(); i != e; ++i) { + int64_t Offset = Offsets[i]; + SDNode *Load = O2SMap[Offset]; + if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads)) + break; // Stop right here. Ignore loads that are further away. + Loads.push_back(Load); + ++NumLoads; + } - if (!Cluster) - continue; + if (NumLoads == 0) + return; - // Sort them in increasing order. - std::sort(Offsets.begin(), Offsets.end()); - - // Check if the loads are close enough. - SmallVector Loads; - unsigned NumLoads = 0; - int64_t BaseOff = Offsets[0]; - SDNode *BaseLoad = O2SMap[BaseOff]; - Loads.push_back(BaseLoad); - for (unsigned i = 1, e = Offsets.size(); i != e; ++i) { - int64_t Offset = Offsets[i]; - SDNode *Load = O2SMap[Offset]; - if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset, - NumLoads)) - break; // Stop right here. 
Ignore loads that are further away. - Loads.push_back(Load); - ++NumLoads; - } + // Cluster loads by adding MVT::Flag outputs and inputs. This also + // ensure they are scheduled in order of increasing addresses. + SDNode *Lead = Loads[0]; + AddFlags(Lead, SDValue(0, 0), true, DAG); + + SDValue InFlag = SDValue(Lead, Lead->getNumValues() - 1); + for (unsigned I = 1, E = Loads.size(); I != E; ++I) { + bool OutFlag = I < E - 1; + SDNode *Load = Loads[I]; + + AddFlags(Load, InFlag, OutFlag, DAG); + + if (OutFlag) + InFlag = SDValue(Load, Load->getNumValues() - 1); + + ++LoadsClustered; + } +} - if (NumLoads == 0) +/// ClusterNodes - Cluster certain nodes which should be scheduled together. +/// +void ScheduleDAGSDNodes::ClusterNodes() { + for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(), + E = DAG->allnodes_end(); NI != E; ++NI) { + SDNode *Node = &*NI; + if (!Node || !Node->isMachineOpcode()) continue; - // Cluster loads by adding MVT::Flag outputs and inputs. This also - // ensure they are scheduled in order of increasing addresses. - SDNode *Lead = Loads[0]; - AddFlags(Lead, SDValue(0,0), true, DAG); - SDValue InFlag = SDValue(Lead, Lead->getNumValues()-1); - for (unsigned i = 1, e = Loads.size(); i != e; ++i) { - bool OutFlag = i < e-1; - SDNode *Load = Loads[i]; - AddFlags(Load, InFlag, OutFlag, DAG); - if (OutFlag) - InFlag = SDValue(Load, Load->getNumValues()-1); - ++LoadsClustered; - } + unsigned Opc = Node->getMachineOpcode(); + const TargetInstrDesc &TID = TII->get(Opc); + if (TID.mayLoad()) + // Cluster loads from "near" addresses into combined SUnits. + ClusterNeighboringLoads(Node); } } @@ -364,8 +393,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() { if (Cost >= 0) PhysReg = 0; - const SDep& dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, - OpSU->Latency, PhysReg); + // If this is a ctrl dep, latency is 1. + unsigned OpLatency = isChain ? 1 : OpSU->Latency; + const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, + OpLatency, PhysReg); if (!isChain && !UnitLatencies) { ComputeOperandLatency(OpN, N, i, const_cast(dep)); ST.adjustSchedDependency(OpSU, SU, const_cast(dep)); @@ -382,8 +413,8 @@ void ScheduleDAGSDNodes::AddSchedEdges() { /// excludes nodes that aren't interesting to scheduling, and represents /// flagged together nodes with a single SUnit. void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) { - // Cluster loads from "near" addresses into combined SUnits. - ClusterNeighboringLoads(); + // Cluster certain nodes which should be scheduled together. + ClusterNodes(); // Populate the SUnits array. BuildSchedUnits(); // Compute all the scheduling dependencies between nodes. 
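The rewritten clustering code above factors the old whole-DAG scan into ClusterNodes, which visits every machine node and calls ClusterNeighboringLoads for each load; that helper gathers the other loads hanging off the same chain, sorts them by offset from the common base pointer, and glues the near ones together with MVT::Flag edges so the scheduler keeps them adjacent and in increasing-address order. The sketch below only illustrates that offset-sort-and-glue decision with plain data structures; LoadInfo, nearEnough, and the GluedTo field are hypothetical stand-ins, not SelectionDAG API.

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for one load node hanging off a common chain.
struct LoadInfo {
  int64_t Offset = 0;          // byte offset from the shared base pointer
  LoadInfo *GluedTo = nullptr; // previous load in the cluster, if any
};

// Assumed nearness heuristic, playing the role of shouldScheduleLoadsNear():
// stop clustering once a load is more than 64 bytes past the first one.
static bool nearEnough(int64_t BaseOff, int64_t Off) {
  return Off - BaseOff <= 64;
}

// Sort candidates by offset and glue each kept load to its predecessor,
// mirroring the Flag-edge chaining that AddFlags performs in the patch.
static void clusterLoads(std::vector<LoadInfo *> &Loads) {
  if (Loads.size() < 2)
    return;
  std::sort(Loads.begin(), Loads.end(),
            [](const LoadInfo *A, const LoadInfo *B) {
              return A->Offset < B->Offset;
            });
  int64_t BaseOff = Loads.front()->Offset;
  for (size_t I = 1; I < Loads.size(); ++I) {
    if (!nearEnough(BaseOff, Loads[I]->Offset))
      break; // farther loads are left unclustered
    Loads[I]->GluedTo = Loads[I - 1];
  }
}

In the patch itself the glue is the MVT::Flag value appended by AddFlags, which also morphs the node in place and restores its memory operands; the sketch captures only the ordering decision.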
@@ -427,15 +458,18 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use, return; unsigned DefIdx = Use->getOperand(OpIdx).getResNo(); - if (Def->isMachineOpcode() && Use->isMachineOpcode()) { + if (Def->isMachineOpcode()) { const TargetInstrDesc &II = TII->get(Def->getMachineOpcode()); if (DefIdx >= II.getNumDefs()) return; int DefCycle = InstrItins.getOperandCycle(II.getSchedClass(), DefIdx); if (DefCycle < 0) return; - const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); - int UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx); + int UseCycle = 1; + if (Use->isMachineOpcode()) { + const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass(); + UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx); + } if (UseCycle >= 0) { int Latency = DefCycle - UseCycle + 1; if (Latency >= 0) @@ -473,7 +507,7 @@ namespace { } // ProcessSourceNode - Process nodes with source order numbers. These are added -// to a vector which EmitSchedule use to determine how to insert dbg_value +// to a vector which EmitSchedule uses to determine how to insert dbg_value // instructions in the right order. static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, @@ -485,13 +519,13 @@ static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG, return; MachineBasicBlock *BB = Emitter.getBlock(); - if (BB->empty() || BB->back().isPHI()) { + if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) { // Did not insert any instruction. Orders.push_back(std::make_pair(Order, (MachineInstr*)0)); return; } - Orders.push_back(std::make_pair(Order, &BB->back())); + Orders.push_back(std::make_pair(Order, prior(Emitter.getInsertPos()))); if (!N->getHasDebugValue()) return; // Opportunistically insert immediate dbg_value uses, i.e. those with source @@ -530,7 +564,7 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { for (; PDI != PDE; ++PDI) { MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap); if (DbgMI) - BB->insert(BB->end(), DbgMI); + BB->insert(InsertPos, DbgMI); } } @@ -574,9 +608,7 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { // Insert all the dbg_values which have not already been inserted in source // order sequence. if (HasDbg) { - MachineBasicBlock::iterator BBBegin = BB->empty() ? BB->end() : BB->begin(); - while (BBBegin != BB->end() && BBBegin->isPHI()) - ++BBBegin; + MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI(); // Sort the source order instructions and use the order to insert debug // values. @@ -586,14 +618,12 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { SDDbgInfo::DbgIterator DE = DAG->DbgEnd(); // Now emit the rest according to source order. unsigned LastOrder = 0; - MachineInstr *LastMI = 0; for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) { unsigned Order = Orders[i].first; MachineInstr *MI = Orders[i].second; // Insert all SDDbgValue's whose order(s) are before "Order". if (!MI) continue; - MachineBasicBlock *MIBB = MI->getParent(); #ifndef NDEBUG unsigned LastDIOrder = 0; #endif @@ -612,13 +642,14 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { // Insert to start of the BB (after PHIs). BB->insert(BBBegin, DbgMI); else { + // Insert at the instruction, which may be in a different + // block, if the block was split by a custom inserter. 
MachineBasicBlock::iterator Pos = MI; - MIBB->insert(llvm::next(Pos), DbgMI); + MI->getParent()->insert(llvm::next(Pos), DbgMI); } } } LastOrder = Order; - LastMI = MI; } // Add trailing DbgValue's before the terminator. FIXME: May want to add // some of them before one or more conditional branches? diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index e8714ba83285..842fc8c72703 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -108,7 +108,10 @@ namespace llvm { private: /// ClusterNeighboringLoads - Cluster loads from "near" addresses into /// combined SUnits. - void ClusterNeighboringLoads(); + void ClusterNeighboringLoads(SDNode *Node); + /// ClusterNodes - Cluster certain nodes which should be scheduled together. + /// + void ClusterNodes(); /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph. void BuildSchedUnits(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 38bf68b8539f..e83a0346b535 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -790,9 +790,8 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { } // EntryNode could meaningfully have debug info if we can find it... -SelectionDAG::SelectionDAG(const TargetMachine &tm, FunctionLoweringInfo &fli) +SelectionDAG::SelectionDAG(const TargetMachine &tm) : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()), - FLI(fli), EntryNode(ISD::EntryToken, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), Ordering(0) { AllNodes.push_back(&EntryNode); @@ -808,7 +807,6 @@ void SelectionDAG::init(MachineFunction &mf) { SelectionDAG::~SelectionDAG() { allnodes_clear(); delete Ordering; - DbgInfo->clear(); delete DbgInfo; } @@ -835,11 +833,8 @@ void SelectionDAG::clear() { EntryNode.UseList = 0; AllNodes.push_back(&EntryNode); Root = getEntryNode(); - delete Ordering; - Ordering = new SDNodeOrdering(); + Ordering->clear(); DbgInfo->clear(); - delete DbgInfo; - DbgInfo = new SDDbgInfo(); } SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) { @@ -980,7 +975,7 @@ SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) { } } -SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, +SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL, EVT VT, int64_t Offset, bool isTargetGA, unsigned char TargetFlags) { @@ -1015,7 +1010,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); - SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, GV, VT, + SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL, GV, VT, Offset, TargetFlags); CSEMap.InsertNode(N, IP); AllNodes.push_back(N); @@ -2291,7 +2286,6 @@ bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const { SDValue SelectionDAG::getShuffleScalarElt(const ShuffleVectorSDNode *N, unsigned i) { EVT VT = N->getValueType(0); - DebugLoc dl = N->getDebugLoc(); if (N->getMaskElt(i) < 0) return getUNDEF(VT.getVectorElementType()); unsigned Index = N->getMaskElt(i); @@ -2475,9 +2469,18 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, VT.getVectorNumElements() == Operand.getValueType().getVectorNumElements()) && "Vector element count mismatch!"); - if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND) + + if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == 
ISD::SIGN_EXTEND || + OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + + // (ext (trunx x)) -> x + if (OpOpcode == ISD::TRUNCATE) { + SDValue OpOp = Operand.getNode()->getOperand(0); + if (OpOp.getValueType() == VT) + return OpOp; + } break; case ISD::TRUNCATE: assert(VT.isInteger() && Operand.getValueType().isInteger() && @@ -2622,7 +2625,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, if (N1.getOpcode() == ISD::BUILD_VECTOR && N2.getOpcode() == ISD::BUILD_VECTOR) { SmallVector Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); - Elts.insert(Elts.end(), N2.getNode()->op_begin(), N2.getNode()->op_end()); + Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size()); } break; @@ -3011,7 +3014,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3) { // Perform various simplifications. ConstantSDNode *N1C = dyn_cast(N1.getNode()); - ConstantSDNode *N2C = dyn_cast(N2.getNode()); switch (Opcode) { case ISD::CONCAT_VECTORS: // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to @@ -3020,8 +3022,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, N2.getOpcode() == ISD::BUILD_VECTOR && N3.getOpcode() == ISD::BUILD_VECTOR) { SmallVector Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); - Elts.insert(Elts.end(), N2.getNode()->op_begin(), N2.getNode()->op_end()); - Elts.insert(Elts.end(), N3.getNode()->op_begin(), N3.getNode()->op_end()); + Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); + Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end()); return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size()); } break; @@ -3041,14 +3043,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, if (N2 == N3) return N2; // select C, X, X -> X break; - case ISD::BRCOND: - if (N2C) { - if (N2C->getZExtValue()) // Unconditional branch - return getNode(ISD::BR, DL, MVT::Other, N1, N3); - else - return N1; // Never-taken branch - } - break; case ISD::VECTOR_SHUFFLE: llvm_unreachable("should use getVectorShuffle constructor!"); break; @@ -3267,6 +3261,15 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, if (VT.bitsGT(LVT)) VT = LVT; } + + // If we're optimizing for size, and there is a limit, bump the maximum number + // of operations inserted down to 4. This is a wild guess that approximates + // the size of a call to memcpy or memset (3 arguments + call). + if (Limit != ~0U) { + const Function *F = DAG.getMachineFunction().getFunction(); + if (F->hasFnAttr(Attribute::OptimizeForSize)) + Limit = 4; + } unsigned NumMemOps = 0; while (Size != 0) { @@ -3321,9 +3324,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, std::string Str; bool CopyFromStr = isMemSrcFromString(Src, Str); bool isZeroStr = CopyFromStr && Str.empty(); - uint64_t Limit = -1ULL; - if (!AlwaysInline) - Limit = TLI.getMaxStoresPerMemcpy(); + unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(); + if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), (isZeroStr ? 0 : SrcAlign), @@ -3368,7 +3370,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, // FIXME does the case above also need this? 
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); assert(NVT.bitsGE(VT)); - Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, + Value = DAG.getExtLoad(ISD::EXTLOAD, NVT, dl, Chain, getMemBasePlusOffset(Src, SrcOff, DAG), SrcSV, SrcSVOff + SrcOff, VT, isVol, false, MinAlign(SrcAlign, SrcOff)); @@ -3401,9 +3403,6 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, // below a certain threshold. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; - uint64_t Limit = -1ULL; - if (!AlwaysInline) - Limit = TLI.getMaxStoresPerMemmove(); bool DstAlignCanChange = false; MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); FrameIndexSDNode *FI = dyn_cast(Dst); @@ -3412,6 +3411,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; + unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), @@ -3895,8 +3895,8 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList, } SDValue -SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl, - ISD::LoadExtType ExtType, EVT VT, SDValue Chain, +SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, + EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, const Value *SV, int SVOffset, EVT MemVT, bool isVolatile, bool isNonTemporal, @@ -3919,12 +3919,12 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl, MachineMemOperand *MMO = MF.getMachineMemOperand(SV, Flags, SVOffset, MemVT.getStoreSize(), Alignment); - return getLoad(AM, dl, ExtType, VT, Chain, Ptr, Offset, MemVT, MMO); + return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO); } SDValue -SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl, - ISD::LoadExtType ExtType, EVT VT, SDValue Chain, +SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, + EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, EVT MemVT, MachineMemOperand *MMO) { if (VT == MemVT) { @@ -3974,18 +3974,18 @@ SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl, bool isVolatile, bool isNonTemporal, unsigned Alignment) { SDValue Undef = getUNDEF(Ptr.getValueType()); - return getLoad(ISD::UNINDEXED, dl, ISD::NON_EXTLOAD, VT, Chain, Ptr, Undef, + return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, SV, SVOffset, VT, isVolatile, isNonTemporal, Alignment); } -SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, EVT VT, +SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, const Value *SV, int SVOffset, EVT MemVT, bool isVolatile, bool isNonTemporal, unsigned Alignment) { SDValue Undef = getUNDEF(Ptr.getValueType()); - return getLoad(ISD::UNINDEXED, dl, ExtType, VT, Chain, Ptr, Undef, + return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, SV, SVOffset, MemVT, isVolatile, isNonTemporal, Alignment); } @@ -3995,7 +3995,7 @@ SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base, LoadSDNode *LD = cast(OrigLoad); assert(LD->getOffset().getOpcode() == ISD::UNDEF && "Load is already a indexed load!"); - return getLoad(AM, dl, LD->getExtensionType(), OrigLoad.getValueType(), + return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl, LD->getChain(), Base, Offset, LD->getSrcValue(), LD->getSrcValueOffset(), LD->getMemoryVT(), 
LD->isVolatile(), LD->isNonTemporal(), LD->getAlignment()); @@ -4141,9 +4141,10 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base, SDValue SelectionDAG::getVAArg(EVT VT, DebugLoc dl, SDValue Chain, SDValue Ptr, - SDValue SV) { - SDValue Ops[] = { Chain, Ptr, SV }; - return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 3); + SDValue SV, + unsigned Align) { + SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, MVT::i32) }; + return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 4); } SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, @@ -4425,17 +4426,16 @@ SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) { /// already exists. If the resultant node does not exist in the DAG, the /// input node is returned. As a degenerate case, if you specify the same /// input operands as the node already has, the input node is returned. -SDValue SelectionDAG::UpdateNodeOperands(SDValue InN, SDValue Op) { - SDNode *N = InN.getNode(); +SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) { assert(N->getNumOperands() == 1 && "Update with wrong number of operands"); // Check to see if there is no change. - if (Op == N->getOperand(0)) return InN; + if (Op == N->getOperand(0)) return N; // See if the modified node already exists. void *InsertPos = 0; if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos)) - return SDValue(Existing, InN.getResNo()); + return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) @@ -4447,22 +4447,20 @@ SDValue SelectionDAG::UpdateNodeOperands(SDValue InN, SDValue Op) { // If this gets put into a CSE map, add it. if (InsertPos) CSEMap.InsertNode(N, InsertPos); - return InN; + return N; } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue InN, SDValue Op1, SDValue Op2) { - SDNode *N = InN.getNode(); +SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { assert(N->getNumOperands() == 2 && "Update with wrong number of operands"); // Check to see if there is no change. if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1)) - return InN; // No operands changed, just return the input node. + return N; // No operands changed, just return the input node. // See if the modified node already exists. void *InsertPos = 0; if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos)) - return SDValue(Existing, InN.getResNo()); + return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) @@ -4477,32 +4475,31 @@ UpdateNodeOperands(SDValue InN, SDValue Op1, SDValue Op2) { // If this gets put into a CSE map, add it. 
if (InsertPos) CSEMap.InsertNode(N, InsertPos); - return InN; + return N; } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, SDValue Op3) { +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) { SDValue Ops[] = { Op1, Op2, Op3 }; return UpdateNodeOperands(N, Ops, 3); } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4) { SDValue Ops[] = { Op1, Op2, Op3, Op4 }; return UpdateNodeOperands(N, Ops, 4); } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue N, SDValue Op1, SDValue Op2, +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5) { SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 }; return UpdateNodeOperands(N, Ops, 5); } -SDValue SelectionDAG:: -UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) { - SDNode *N = InN.getNode(); +SDNode *SelectionDAG:: +UpdateNodeOperands(SDNode *N, const SDValue *Ops, unsigned NumOps) { assert(N->getNumOperands() == NumOps && "Update with wrong number of operands"); @@ -4516,12 +4513,12 @@ UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) { } // No operands changed, just return the input node. - if (!AnyChange) return InN; + if (!AnyChange) return N; // See if the modified node already exists. void *InsertPos = 0; if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos)) - return SDValue(Existing, InN.getResNo()); + return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) @@ -4535,7 +4532,7 @@ UpdateNodeOperands(SDValue InN, const SDValue *Ops, unsigned NumOps) { // If this gets put into a CSE map, add it. 
if (InsertPos) CSEMap.InsertNode(N, InsertPos); - return InN; + return N; } /// DropOperands - Release the operands and set this node to have @@ -5378,9 +5375,10 @@ HandleSDNode::~HandleSDNode() { DropOperands(); } -GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, const GlobalValue *GA, +GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, DebugLoc DL, + const GlobalValue *GA, EVT VT, int64_t o, unsigned char TF) - : SDNode(Opc, DebugLoc(), getSDVTList(VT)), Offset(o), TargetFlags(TF) { + : SDNode(Opc, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) { TheGlobal = GA; } @@ -5669,13 +5667,16 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FSQRT: return "fsqrt"; case ISD::FSIN: return "fsin"; case ISD::FCOS: return "fcos"; - case ISD::FPOWI: return "fpowi"; - case ISD::FPOW: return "fpow"; case ISD::FTRUNC: return "ftrunc"; case ISD::FFLOOR: return "ffloor"; case ISD::FCEIL: return "fceil"; case ISD::FRINT: return "frint"; case ISD::FNEARBYINT: return "fnearbyint"; + case ISD::FEXP: return "fexp"; + case ISD::FEXP2: return "fexp2"; + case ISD::FLOG: return "flog"; + case ISD::FLOG2: return "flog2"; + case ISD::FLOG10: return "flog10"; // Binary operators case ISD::ADD: return "add"; @@ -5706,7 +5707,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FREM: return "frem"; case ISD::FCOPYSIGN: return "fcopysign"; case ISD::FGETSIGN: return "fgetsign"; + case ISD::FPOW: return "fpow"; + case ISD::FPOWI: return "fpowi"; case ISD::SETCC: return "setcc"; case ISD::VSETCC: return "vsetcc"; case ISD::SELECT: return "select"; @@ -6260,23 +6263,6 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo(); unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx), FrameOffset); - if (MFI.isFixedObjectIndex(FrameIdx)) { - int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset; - - // The alignment of the frame index can be determined from its offset from - // the incoming frame position. If the frame object is at offset 32 and - // the stack is guaranteed to be 16-byte aligned, then we know that the - // object is 16-byte aligned. - unsigned StackAlign = getTarget().getFrameInfo()->getStackAlignment(); - unsigned Align = MinAlign(ObjectOffset, StackAlign); - - // Finally, the frame object itself may have a known alignment. Factor - // the alignment + offset into a new alignment. For example, if we know - // the FI is 8 byte aligned, but the pointer is 4 off, we really have a - // 4-byte alignment of the resultant pointer. Likewise align 4 + 4-byte - // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc. 
- return std::max(Align, FIInfoAlign); - } return FIInfoAlign; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index fbe601f2db09..d323c163c143 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -14,7 +14,6 @@ #define DEBUG_TYPE "isel" #include "SDNodeDbgValue.h" #include "SelectionDAGBuilder.h" -#include "FunctionLoweringInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -32,6 +31,7 @@ #include "llvm/Module.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/MachineFunction.h" @@ -70,113 +70,6 @@ LimitFPPrecision("limit-float-precision", cl::location(LimitFloatPrecision), cl::init(0)); -namespace { - /// RegsForValue - This struct represents the registers (physical or virtual) - /// that a particular set of values is assigned, and the type information - /// about the value. The most common situation is to represent one value at a - /// time, but struct or array values are handled element-wise as multiple - /// values. The splitting of aggregates is performed recursively, so that we - /// never have aggregate-typed registers. The values at this point do not - /// necessarily have legal types, so each value may require one or more - /// registers of some legal type. - /// - struct RegsForValue { - /// TLI - The TargetLowering object. - /// - const TargetLowering *TLI; - - /// ValueVTs - The value types of the values, which may not be legal, and - /// may need be promoted or synthesized from one or more registers. - /// - SmallVector ValueVTs; - - /// RegVTs - The value types of the registers. This is the same size as - /// ValueVTs and it records, for each value, what the type of the assigned - /// register or registers are. (Individual values are never synthesized - /// from more than one type of register.) - /// - /// With virtual registers, the contents of RegVTs is redundant with TLI's - /// getRegisterType member function, however when with physical registers - /// it is necessary to have a separate record of the types. - /// - SmallVector RegVTs; - - /// Regs - This list holds the registers assigned to the values. - /// Each legal or promoted value requires one register, and each - /// expanded value requires multiple registers. - /// - SmallVector Regs; - - RegsForValue() : TLI(0) {} - - RegsForValue(const TargetLowering &tli, - const SmallVector ®s, - EVT regvt, EVT valuevt) - : TLI(&tli), ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {} - RegsForValue(const TargetLowering &tli, - const SmallVector ®s, - const SmallVector ®vts, - const SmallVector &valuevts) - : TLI(&tli), ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {} - RegsForValue(LLVMContext &Context, const TargetLowering &tli, - unsigned Reg, const Type *Ty) : TLI(&tli) { - ComputeValueVTs(tli, Ty, ValueVTs); - - for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; - unsigned NumRegs = TLI->getNumRegisters(Context, ValueVT); - EVT RegisterVT = TLI->getRegisterType(Context, ValueVT); - for (unsigned i = 0; i != NumRegs; ++i) - Regs.push_back(Reg + i); - RegVTs.push_back(RegisterVT); - Reg += NumRegs; - } - } - - /// areValueTypesLegal - Return true if types of all the values are legal. 
- bool areValueTypesLegal() { - for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT RegisterVT = RegVTs[Value]; - if (!TLI->isTypeLegal(RegisterVT)) - return false; - } - return true; - } - - - /// append - Add the specified values to this one. - void append(const RegsForValue &RHS) { - TLI = RHS.TLI; - ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end()); - RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end()); - Regs.append(RHS.Regs.begin(), RHS.Regs.end()); - } - - - /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from - /// this value and returns the result as a ValueVTs value. This uses - /// Chain/Flag as the input and updates them for the output Chain/Flag. - /// If the Flag pointer is NULL, no flag is used. - SDValue getCopyFromRegs(SelectionDAG &DAG, DebugLoc dl, - SDValue &Chain, SDValue *Flag) const; - - /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the - /// specified value into the registers specified by this object. This uses - /// Chain/Flag as the input and updates them for the output Chain/Flag. - /// If the Flag pointer is NULL, no flag is used. - void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl, - SDValue &Chain, SDValue *Flag) const; - - /// AddInlineAsmOperands - Add this value to the specified inlineasm node - /// operand list. This adds the code marker, matching input operand index - /// (if applicable), and includes the number of values added into it. - void AddInlineAsmOperands(unsigned Kind, - bool HasMatching, unsigned MatchingIdx, - SelectionDAG &DAG, - std::vector &Ops) const; - }; -} - /// getCopyFromParts - Create a value that contains the specified legal parts /// combined into the value they represent. If the parts combine to a type /// larger then ValueVT then AssertOp can be used to specify whether the extra @@ -528,6 +421,268 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc dl, } } +namespace { + /// RegsForValue - This struct represents the registers (physical or virtual) + /// that a particular set of values is assigned, and the type information + /// about the value. The most common situation is to represent one value at a + /// time, but struct or array values are handled element-wise as multiple + /// values. The splitting of aggregates is performed recursively, so that we + /// never have aggregate-typed registers. The values at this point do not + /// necessarily have legal types, so each value may require one or more + /// registers of some legal type. + /// + struct RegsForValue { + /// ValueVTs - The value types of the values, which may not be legal, and + /// may need be promoted or synthesized from one or more registers. + /// + SmallVector ValueVTs; + + /// RegVTs - The value types of the registers. This is the same size as + /// ValueVTs and it records, for each value, what the type of the assigned + /// register or registers are. (Individual values are never synthesized + /// from more than one type of register.) + /// + /// With virtual registers, the contents of RegVTs is redundant with TLI's + /// getRegisterType member function, however when with physical registers + /// it is necessary to have a separate record of the types. + /// + SmallVector RegVTs; + + /// Regs - This list holds the registers assigned to the values. + /// Each legal or promoted value requires one register, and each + /// expanded value requires multiple registers. 
+ /// + SmallVector Regs; + + RegsForValue() {} + + RegsForValue(const SmallVector ®s, + EVT regvt, EVT valuevt) + : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {} + + RegsForValue(const SmallVector ®s, + const SmallVector ®vts, + const SmallVector &valuevts) + : ValueVTs(valuevts), RegVTs(regvts), Regs(regs) {} + + RegsForValue(LLVMContext &Context, const TargetLowering &tli, + unsigned Reg, const Type *Ty) { + ComputeValueVTs(tli, Ty, ValueVTs); + + for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { + EVT ValueVT = ValueVTs[Value]; + unsigned NumRegs = tli.getNumRegisters(Context, ValueVT); + EVT RegisterVT = tli.getRegisterType(Context, ValueVT); + for (unsigned i = 0; i != NumRegs; ++i) + Regs.push_back(Reg + i); + RegVTs.push_back(RegisterVT); + Reg += NumRegs; + } + } + + /// areValueTypesLegal - Return true if types of all the values are legal. + bool areValueTypesLegal(const TargetLowering &TLI) { + for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { + EVT RegisterVT = RegVTs[Value]; + if (!TLI.isTypeLegal(RegisterVT)) + return false; + } + return true; + } + + /// append - Add the specified values to this one. + void append(const RegsForValue &RHS) { + ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end()); + RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end()); + Regs.append(RHS.Regs.begin(), RHS.Regs.end()); + } + + /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from + /// this value and returns the result as a ValueVTs value. This uses + /// Chain/Flag as the input and updates them for the output Chain/Flag. + /// If the Flag pointer is NULL, no flag is used. + SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, + DebugLoc dl, + SDValue &Chain, SDValue *Flag) const; + + /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the + /// specified value into the registers specified by this object. This uses + /// Chain/Flag as the input and updates them for the output Chain/Flag. + /// If the Flag pointer is NULL, no flag is used. + void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl, + SDValue &Chain, SDValue *Flag) const; + + /// AddInlineAsmOperands - Add this value to the specified inlineasm node + /// operand list. This adds the code marker, matching input operand index + /// (if applicable), and includes the number of values added into it. + void AddInlineAsmOperands(unsigned Kind, + bool HasMatching, unsigned MatchingIdx, + SelectionDAG &DAG, + std::vector &Ops) const; + }; +} + +/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from +/// this value and returns the result as a ValueVT value. This uses +/// Chain/Flag as the input and updates them for the output Chain/Flag. +/// If the Flag pointer is NULL, no flag is used. +SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, + FunctionLoweringInfo &FuncInfo, + DebugLoc dl, + SDValue &Chain, SDValue *Flag) const { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Assemble the legal parts into the final values. + SmallVector Values(ValueVTs.size()); + SmallVector Parts; + for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { + // Copy the legal parts from the registers. 
+ EVT ValueVT = ValueVTs[Value]; + unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT); + EVT RegisterVT = RegVTs[Value]; + + Parts.resize(NumRegs); + for (unsigned i = 0; i != NumRegs; ++i) { + SDValue P; + if (Flag == 0) { + P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); + } else { + P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); + *Flag = P.getValue(2); + } + + Chain = P.getValue(1); + + // If the source register was virtual and if we know something about it, + // add an assert node. + if (TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) && + RegisterVT.isInteger() && !RegisterVT.isVector()) { + unsigned SlotNo = Regs[Part+i]-TargetRegisterInfo::FirstVirtualRegister; + if (FuncInfo.LiveOutRegInfo.size() > SlotNo) { + const FunctionLoweringInfo::LiveOutInfo &LOI = + FuncInfo.LiveOutRegInfo[SlotNo]; + + unsigned RegSize = RegisterVT.getSizeInBits(); + unsigned NumSignBits = LOI.NumSignBits; + unsigned NumZeroBits = LOI.KnownZero.countLeadingOnes(); + + // FIXME: We capture more information than the dag can represent. For + // now, just use the tightest assertzext/assertsext possible. + bool isSExt = true; + EVT FromVT(MVT::Other); + if (NumSignBits == RegSize) + isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1 + else if (NumZeroBits >= RegSize-1) + isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1 + else if (NumSignBits > RegSize-8) + isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8 + else if (NumZeroBits >= RegSize-8) + isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8 + else if (NumSignBits > RegSize-16) + isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16 + else if (NumZeroBits >= RegSize-16) + isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16 + else if (NumSignBits > RegSize-32) + isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32 + else if (NumZeroBits >= RegSize-32) + isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32 + + if (FromVT != MVT::Other) + P = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl, + RegisterVT, P, DAG.getValueType(FromVT)); + } + } + + Parts[i] = P; + } + + Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), + NumRegs, RegisterVT, ValueVT); + Part += NumRegs; + Parts.clear(); + } + + return DAG.getNode(ISD::MERGE_VALUES, dl, + DAG.getVTList(&ValueVTs[0], ValueVTs.size()), + &Values[0], ValueVTs.size()); +} + +/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the +/// specified value into the registers specified by this object. This uses +/// Chain/Flag as the input and updates them for the output Chain/Flag. +/// If the Flag pointer is NULL, no flag is used. +void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl, + SDValue &Chain, SDValue *Flag) const { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // Get the list of the values's legal parts. + unsigned NumRegs = Regs.size(); + SmallVector Parts(NumRegs); + for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { + EVT ValueVT = ValueVTs[Value]; + unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT); + EVT RegisterVT = RegVTs[Value]; + + getCopyToParts(DAG, dl, + Val.getValue(Val.getResNo() + Value), + &Parts[Part], NumParts, RegisterVT); + Part += NumParts; + } + + // Copy the parts into the registers. 
+ SmallVector Chains(NumRegs); + for (unsigned i = 0; i != NumRegs; ++i) { + SDValue Part; + if (Flag == 0) { + Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); + } else { + Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); + *Flag = Part.getValue(1); + } + + Chains[i] = Part.getValue(0); + } + + if (NumRegs == 1 || Flag) + // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is + // flagged to it. That is the CopyToReg nodes and the user are considered + // a single scheduling unit. If we create a TokenFactor and return it as + // chain, then the TokenFactor is both a predecessor (operand) of the + // user as well as a successor (the TF operands are flagged to the user). + // c1, f1 = CopyToReg + // c2, f2 = CopyToReg + // c3 = TokenFactor c1, c2 + // ... + // = op c3, ..., f2 + Chain = Chains[NumRegs-1]; + else + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs); +} + +/// AddInlineAsmOperands - Add this value to the specified inlineasm node +/// operand list. This adds the code marker and includes the number of +/// values added into it. +void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, + unsigned MatchingIdx, + SelectionDAG &DAG, + std::vector &Ops) const { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size()); + if (HasMatching) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx); + SDValue Res = DAG.getTargetConstant(Flag, MVT::i32); + Ops.push_back(Res); + + for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) { + unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]); + EVT RegisterVT = RegVTs[Value]; + for (unsigned i = 0; i != NumRegs; ++i) { + assert(Reg < Regs.size() && "Mismatch in # registers expected"); + Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT)); + } + } +} void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa) { AA = &aa; @@ -543,6 +698,7 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa) { /// consumed. void SelectionDAGBuilder::clear() { NodeMap.clear(); + UnusedArgNodeMap.clear(); PendingLoads.clear(); PendingExports.clear(); CurDebugLoc = DebugLoc(); @@ -649,27 +805,63 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { } } +// getValue - Return an SDValue for the given Value. SDValue SelectionDAGBuilder::getValue(const Value *V) { + // If we already have an SDValue for this value, use it. It's important + // to do this first, so that we don't create a CopyFromReg if we already + // have a regular SDValue. SDValue &N = NodeMap[V]; if (N.getNode()) return N; + // If there's a virtual register allocated and initialized for this + // value, use it. + DenseMap::iterator It = FuncInfo.ValueMap.find(V); + if (It != FuncInfo.ValueMap.end()) { + unsigned InReg = It->second; + RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType()); + SDValue Chain = DAG.getEntryNode(); + return N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL); + } + + // Otherwise create a new SDValue and remember it. + SDValue Val = getValueImpl(V); + NodeMap[V] = Val; + return Val; +} + +/// getNonRegisterValue - Return an SDValue for the given Value, but +/// don't look in FuncInfo.ValueMap for a virtual register. +SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) { + // If we already have an SDValue for this value, use it. 
+ SDValue &N = NodeMap[V]; + if (N.getNode()) return N; + + // Otherwise create a new SDValue and remember it. + SDValue Val = getValueImpl(V); + NodeMap[V] = Val; + return Val; +} + +/// getValueImpl - Helper function for getValue and getMaterializedValue. +/// Create an SDValue for the given value. +SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const Constant *C = dyn_cast(V)) { EVT VT = TLI.getValueType(V->getType(), true); if (const ConstantInt *CI = dyn_cast(C)) - return N = DAG.getConstant(*CI, VT); + return DAG.getConstant(*CI, VT); if (const GlobalValue *GV = dyn_cast(C)) - return N = DAG.getGlobalAddress(GV, VT); + return DAG.getGlobalAddress(GV, getCurDebugLoc(), VT); if (isa(C)) - return N = DAG.getConstant(0, TLI.getPointerTy()); + return DAG.getConstant(0, TLI.getPointerTy()); if (const ConstantFP *CFP = dyn_cast(C)) - return N = DAG.getConstantFP(*CFP, VT); + return DAG.getConstantFP(*CFP, VT); if (isa(C) && !V->getType()->isAggregateType()) - return N = DAG.getUNDEF(VT); + return DAG.getUNDEF(VT); if (const ConstantExpr *CE = dyn_cast(C)) { visit(CE->getOpcode(), *CE); @@ -757,82 +949,25 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) { return DAG.getFrameIndex(SI->second, TLI.getPointerTy()); } - unsigned InReg = FuncInfo.ValueMap[V]; - assert(InReg && "Value not in map!"); - - RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType()); - SDValue Chain = DAG.getEntryNode(); - return RFV.getCopyFromRegs(DAG, getCurDebugLoc(), Chain, NULL); -} - -/// Get the EVTs and ArgFlags collections that represent the legalized return -/// type of the given function. This does not require a DAG or a return value, -/// and is suitable for use before any DAGs for the function are constructed. -static void getReturnInfo(const Type* ReturnType, - Attributes attr, SmallVectorImpl &OutVTs, - SmallVectorImpl &OutFlags, - const TargetLowering &TLI, - SmallVectorImpl *Offsets = 0) { - SmallVector ValueVTs; - ComputeValueVTs(TLI, ReturnType, ValueVTs); - unsigned NumValues = ValueVTs.size(); - if (NumValues == 0) return; - unsigned Offset = 0; - - for (unsigned j = 0, f = NumValues; j != f; ++j) { - EVT VT = ValueVTs[j]; - ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - - if (attr & Attribute::SExt) - ExtendKind = ISD::SIGN_EXTEND; - else if (attr & Attribute::ZExt) - ExtendKind = ISD::ZERO_EXTEND; - - // FIXME: C calling convention requires the return type to be promoted to - // at least 32-bit. But this is not necessary for non-C calling - // conventions. The frontend should mark functions whose return values - // require promoting with signext or zeroext attributes. 
- if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { - EVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); - if (VT.bitsLT(MinVT)) - VT = MinVT; - } - - unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); - EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); - unsigned PartSize = TLI.getTargetData()->getTypeAllocSize( - PartVT.getTypeForEVT(ReturnType->getContext())); - - // 'inreg' on function refers to return value - ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr & Attribute::InReg) - Flags.setInReg(); - - // Propagate extension type if any - if (attr & Attribute::SExt) - Flags.setSExt(); - else if (attr & Attribute::ZExt) - Flags.setZExt(); - - for (unsigned i = 0; i < NumParts; ++i) { - OutVTs.push_back(PartVT); - OutFlags.push_back(Flags); - if (Offsets) - { - Offsets->push_back(Offset); - Offset += PartSize; - } - } + // If this is an instruction which fast-isel has deferred, select it now. + if (const Instruction *Inst = dyn_cast(V)) { + unsigned InReg = FuncInfo.InitializeRegForValue(Inst); + RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType()); + SDValue Chain = DAG.getEntryNode(); + return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL); } + + llvm_unreachable("Can't get register for value!"); + return SDValue(); } void SelectionDAGBuilder::visitRet(const ReturnInst &I) { SDValue Chain = getControlRoot(); SmallVector Outs; - FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo(); + SmallVector OutVals; - if (!FLI.CanLowerReturn) { - unsigned DemoteReg = FLI.DemoteRegister; + if (!FuncInfo.CanLowerReturn) { + unsigned DemoteReg = FuncInfo.DemoteRegister; const Function *F = I.getParent()->getParent(); // Emit a store of the return value through the virtual register. @@ -908,8 +1043,11 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { else if (F->paramHasAttr(0, Attribute::ZExt)) Flags.setZExt(); - for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, Parts[i], /*isfixed=*/true)); + for (unsigned i = 0; i < NumParts; ++i) { + Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(), + /*isfixed=*/true)); + OutVals.push_back(Parts[i]); + } } } } @@ -918,7 +1056,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { CallingConv::ID CallConv = DAG.getMachineFunction().getFunction()->getCallingConv(); Chain = TLI.LowerReturn(Chain, CallConv, isVarArg, - Outs, getCurDebugLoc(), DAG); + Outs, OutVals, getCurDebugLoc(), DAG); // Verify that the target's LowerReturn behaved as expected. assert(Chain.getNode() && Chain.getValueType() == MVT::Other && @@ -1119,7 +1257,7 @@ SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector &Cases){ } void SelectionDAGBuilder::visitBr(const BranchInst &I) { - MachineBasicBlock *BrMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *BrMBB = FuncInfo.MBB; // Update machine-CFG edges. MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)]; @@ -1269,18 +1407,10 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, MVT::Other, getControlRoot(), Cond, DAG.getBasicBlock(CB.TrueBB)); - // If the branch was constant folded, fix up the CFG. - if (BrCond.getOpcode() == ISD::BR) { - SwitchBB->removeSuccessor(CB.FalseBB); - } else { - // Otherwise, go ahead and insert the false branch. 
- if (BrCond == getControlRoot()) - SwitchBB->removeSuccessor(CB.TrueBB); - - if (CB.FalseBB != NextBlock) - BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, - DAG.getBasicBlock(CB.FalseBB)); - } + // Insert the false branch. + if (CB.FalseBB != NextBlock) + BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond, + DAG.getBasicBlock(CB.FalseBB)); DAG.setRoot(BrCond); } @@ -1319,7 +1449,7 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, // therefore require extension or truncating. SwitchOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy()); - unsigned JumpTableReg = FuncInfo.MakeReg(TLI.getPointerTy()); + unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(), JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; @@ -1370,7 +1500,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, SDValue ShiftOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy()); - B.Reg = FuncInfo.MakeReg(TLI.getPointerTy()); + B.Reg = FuncInfo.CreateReg(TLI.getPointerTy()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(), B.Reg, ShiftOp); @@ -1402,29 +1532,41 @@ void SelectionDAGBuilder::visitBitTestCase(MachineBasicBlock* NextMBB, unsigned Reg, BitTestCase &B, MachineBasicBlock *SwitchBB) { - // Make desired shift SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(), Reg, TLI.getPointerTy()); - SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(), - TLI.getPointerTy(), - DAG.getConstant(1, TLI.getPointerTy()), - ShiftOp); - - // Emit bit tests and jumps - SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(), - TLI.getPointerTy(), SwitchVal, - DAG.getConstant(B.Mask, TLI.getPointerTy())); - SDValue AndCmp = DAG.getSetCC(getCurDebugLoc(), - TLI.getSetCCResultType(AndOp.getValueType()), - AndOp, DAG.getConstant(0, TLI.getPointerTy()), - ISD::SETNE); + SDValue Cmp; + if (CountPopulation_64(B.Mask) == 1) { + // Testing for a single bit; just compare the shift count with what it + // would need to be to shift a 1 bit in that position. + Cmp = DAG.getSetCC(getCurDebugLoc(), + TLI.getSetCCResultType(ShiftOp.getValueType()), + ShiftOp, + DAG.getConstant(CountTrailingZeros_64(B.Mask), + TLI.getPointerTy()), + ISD::SETEQ); + } else { + // Make desired shift + SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(), + TLI.getPointerTy(), + DAG.getConstant(1, TLI.getPointerTy()), + ShiftOp); + + // Emit bit tests and jumps + SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(), + TLI.getPointerTy(), SwitchVal, + DAG.getConstant(B.Mask, TLI.getPointerTy())); + Cmp = DAG.getSetCC(getCurDebugLoc(), + TLI.getSetCCResultType(AndOp.getValueType()), + AndOp, DAG.getConstant(0, TLI.getPointerTy()), + ISD::SETNE); + } SwitchBB->addSuccessor(B.TargetBB); SwitchBB->addSuccessor(NextMBB); SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(), MVT::Other, getControlRoot(), - AndCmp, DAG.getBasicBlock(B.TargetBB)); + Cmp, DAG.getBasicBlock(B.TargetBB)); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. @@ -1441,7 +1583,7 @@ void SelectionDAGBuilder::visitBitTestCase(MachineBasicBlock* NextMBB, } void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { - MachineBasicBlock *InvokeMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *InvokeMBB = FuncInfo.MBB; // Retrieve successors. 
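One functional change in the hunk above is in visitBitTestCase: when the bit-test mask has exactly one set bit, the shift-plus-AND sequence is replaced by a direct comparison of the variable shift amount against that bit's index. A small self-contained illustration of why the two forms agree; CountPopulation_64 and CountTrailingZeros_64 are modelled here with the GCC/Clang builtins, and Shift is assumed to be below 64:

#include <cstdint>

// Generic form: materialize (1 << Shift), AND with the mask, test non-zero.
bool bitTestGeneric(uint64_t Mask, unsigned Shift) {
  return ((UINT64_C(1) << Shift) & Mask) != 0;
}

// Single-bit form: when popcount(Mask) == 1 the test above holds exactly
// when Shift equals the index of that one bit, so compare shift counts.
bool bitTestSingleBit(uint64_t Mask, unsigned Shift) {
  return Shift == static_cast<unsigned>(__builtin_ctzll(Mask));
}

// Example: Mask = 0x20 has a single bit at index 5, so both forms are true
// only for Shift == 5, and the single-bit form needs no shift and no AND.
bool agreesOnSingleBitMasks(uint64_t Mask, unsigned Shift) {
  if (__builtin_popcountll(Mask) != 1)
    return true; // the rewrite only applies to single-bit masks
  return bitTestGeneric(Mask, Shift) == bitTestSingleBit(Mask, Shift);
}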
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; @@ -1969,7 +2111,7 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, } void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { - MachineBasicBlock *SwitchMBB = FuncInfo.MBBMap[SI.getParent()]; + MachineBasicBlock *SwitchMBB = FuncInfo.MBB; // Figure out which block is immediately after the current one. MachineBasicBlock *NextBlock = 0; @@ -2035,7 +2177,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { } void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { - MachineBasicBlock *IndirectBrMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. SmallVector succs; @@ -2245,7 +2387,6 @@ void SelectionDAGBuilder::visitPtrToInt(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); - EVT SrcVT = N.getValueType(); EVT DestVT = TLI.getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT)); } @@ -2254,7 +2395,6 @@ void SelectionDAGBuilder::visitIntToPtr(const User &I) { // What to do depends on the size of the integer and the size of the pointer. // We can either truncate, zero extend, or no-op, accordingly. SDValue N = getValue(I.getOperand(0)); - EVT SrcVT = N.getValueType(); EVT DestVT = TLI.getValueType(I.getType()); setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT)); } @@ -2579,7 +2719,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // If this is a constant subscript, handle it quickly. if (const ConstantInt *CI = dyn_cast(Idx)) { - if (CI->getZExtValue() == 0) continue; + if (CI->isZero()) continue; uint64_t Offs = TD->getTypeAllocSize(Ty)*cast(CI)->getSExtValue(); SDValue OffsVal; @@ -2643,12 +2783,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { SDValue AllocSize = getValue(I.getArraySize()); - AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), AllocSize.getValueType(), - AllocSize, - DAG.getConstant(TySize, AllocSize.getValueType())); - EVT IntPtr = TLI.getPointerTy(); - AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr); + if (AllocSize.getValueType() != IntPtr) + AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr); + + AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), IntPtr, + AllocSize, + DAG.getConstant(TySize, IntPtr)); // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to @@ -2804,8 +2945,8 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Ops.push_back(DAG.getConstant(Intrinsic, TLI.getPointerTy())); // Add all operands of the call to the operand list. 
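The visitAlloca hunk above now converts the dynamic element count to the pointer-sized integer type before multiplying by the type size, so the whole size computation happens in IntPtr. One visible consequence, sketched below with plain integers and invented function names, is that a product which would wrap in a narrower count type is instead computed at full pointer width:

#include <cstdint>

// Old ordering: multiply in the count's own width, then extend the product.
uint64_t sizeMulThenExtend(uint32_t Count, uint32_t TySize) {
  uint32_t Product = Count * TySize;           // 32-bit multiply, can wrap
  return static_cast<uint64_t>(Product);       // zero-extend afterwards
}

// New ordering: extend the count to the pointer width first, then multiply.
uint64_t sizeExtendThenMul(uint32_t Count, uint32_t TySize) {
  uint64_t Wide = Count;                       // zext to pointer width first
  return Wide * static_cast<uint64_t>(TySize); // 64-bit multiply
}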
- for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i) { - SDValue Op = getValue(I.getOperand(i)); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + SDValue Op = getValue(I.getArgOperand(i)); assert(TLI.isTypeLegal(Op.getValueType()) && "Intrinsic uses a non-legal type?"); Ops.push_back(Op); @@ -2910,11 +3051,11 @@ SelectionDAGBuilder::implVisitBinaryAtomic(const CallInst& I, SDValue Root = getRoot(); SDValue L = DAG.getAtomic(Op, getCurDebugLoc(), - getValue(I.getOperand(2)).getValueType().getSimpleVT(), + getValue(I.getArgOperand(1)).getValueType().getSimpleVT(), Root, - getValue(I.getOperand(1)), - getValue(I.getOperand(2)), - I.getOperand(1)); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + I.getArgOperand(0)); setValue(&I, L); DAG.setRoot(L.getValue(1)); return 0; @@ -2923,8 +3064,8 @@ SelectionDAGBuilder::implVisitBinaryAtomic(const CallInst& I, // implVisitAluOverflow - Lower arithmetic overflow instrinsics. const char * SelectionDAGBuilder::implVisitAluOverflow(const CallInst &I, ISD::NodeType Op) { - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); setValue(&I, DAG.getNode(Op, getCurDebugLoc(), VTs, Op1, Op2)); @@ -2938,9 +3079,9 @@ SelectionDAGBuilder::visitExp(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); // Put the exponent in the right bit position for later addition to the // final result: @@ -3050,8 +3191,8 @@ SelectionDAGBuilder::visitExp(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FEXP, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3064,9 +3205,9 @@ SelectionDAGBuilder::visitLog(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); // Scale the exponent by log(2) [0.69314718f]. @@ -3160,8 +3301,8 @@ SelectionDAGBuilder::visitLog(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FLOG, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3174,9 +3315,9 @@ SelectionDAGBuilder::visitLog2(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); // Get the exponent. 
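Much of this file's churn, here and in the following hunks, is the mechanical switch from I.getOperand(N) to I.getArgOperand(N-1): call arguments are addressed by their own zero-based index rather than by their position in the instruction's operand list. A toy model of that relationship; the operand layout below, callee first and then the arguments, is invented purely to make the old and new indexing concrete:

#include <cassert>
#include <vector>

struct ToyValue { int Id; };

// A toy call instruction whose operand list stores the callee followed by
// the arguments, which is the layout the old getOperand(1..N) indexing assumed.
class ToyCallInst {
  std::vector<ToyValue> Operands; // [callee, arg0, arg1, ...]
public:
  explicit ToyCallInst(std::vector<ToyValue> Ops) : Operands(std::move(Ops)) {}

  ToyValue getOperand(unsigned i) const { return Operands.at(i); }

  unsigned getNumArgOperands() const {
    return static_cast<unsigned>(Operands.size()) - 1;
  }

  // Zero-based argument accessor, independent of where the callee is stored.
  ToyValue getArgOperand(unsigned i) const {
    assert(i < getNumArgOperands() && "argument index out of range");
    return Operands.at(i + 1);
  }
};

// With this layout getArgOperand(0) is getOperand(1), which is exactly the
// substitution applied throughout the intrinsic-lowering hunks above.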
@@ -3269,8 +3410,8 @@ SelectionDAGBuilder::visitLog2(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FLOG2, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3283,9 +3424,9 @@ SelectionDAGBuilder::visitLog10(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op); // Scale the exponent by log10(2) [0.30102999f]. @@ -3371,8 +3512,8 @@ SelectionDAGBuilder::visitLog10(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FLOG10, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3385,9 +3526,9 @@ SelectionDAGBuilder::visitExp2(const CallInst &I) { SDValue result; DebugLoc dl = getCurDebugLoc(); - if (getValue(I.getOperand(1)).getValueType() == MVT::f32 && + if (getValue(I.getArgOperand(0)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(1)); + SDValue Op = getValue(I.getArgOperand(0)); SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op); @@ -3485,8 +3626,8 @@ SelectionDAGBuilder::visitExp2(const CallInst &I) { } else { // No special expansion. result = DAG.getNode(ISD::FEXP2, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0))); } setValue(&I, result); @@ -3497,12 +3638,12 @@ SelectionDAGBuilder::visitExp2(const CallInst &I) { void SelectionDAGBuilder::visitPow(const CallInst &I) { SDValue result; - const Value *Val = I.getOperand(1); + const Value *Val = I.getArgOperand(0); DebugLoc dl = getCurDebugLoc(); bool IsExp10 = false; if (getValue(Val).getValueType() == MVT::f32 && - getValue(I.getOperand(2)).getValueType() == MVT::f32 && + getValue(I.getArgOperand(1)).getValueType() == MVT::f32 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { if (Constant *C = const_cast(dyn_cast(Val))) { if (ConstantFP *CFP = dyn_cast(C)) { @@ -3513,7 +3654,7 @@ SelectionDAGBuilder::visitPow(const CallInst &I) { } if (IsExp10 && LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) { - SDValue Op = getValue(I.getOperand(2)); + SDValue Op = getValue(I.getArgOperand(1)); // Put the exponent in the right bit position for later addition to the // final result: @@ -3618,9 +3759,9 @@ SelectionDAGBuilder::visitPow(const CallInst &I) { } else { // No special expansion. 
result = DAG.getNode(ISD::FPOW, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1))); } setValue(&I, result); @@ -3696,7 +3837,7 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const DbgValueInst &DI, if (DV.isInlinedFnArgument(MF.getFunction())) return false; - MachineBasicBlock *MBB = FuncInfo.MBBMap[DI.getParent()]; + MachineBasicBlock *MBB = FuncInfo.MBB; if (MBB != &MF.front()) return false; @@ -3750,11 +3891,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::vacopy: visitVACopy(I); return 0; case Intrinsic::returnaddress: setValue(&I, DAG.getNode(ISD::RETURNADDR, dl, TLI.getPointerTy(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::frameaddress: setValue(&I, DAG.getNode(ISD::FRAMEADDR, dl, TLI.getPointerTy(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::setjmp: return "_setjmp"+!TLI.usesUnderscoreSetJmp(); @@ -3763,63 +3904,63 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::memcpy: { // Assert for address < 256 since we support only user defined address // spaces. - assert(cast(I.getOperand(1)->getType())->getAddressSpace() + assert(cast(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && - cast(I.getOperand(2)->getType())->getAddressSpace() + cast(I.getArgOperand(1)->getType())->getAddressSpace() < 256 && "Unknown address space"); - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); - SDValue Op3 = getValue(I.getOperand(3)); - unsigned Align = cast(I.getOperand(4))->getZExtValue(); - bool isVol = cast(I.getOperand(5))->getZExtValue(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + unsigned Align = cast(I.getArgOperand(3))->getZExtValue(); + bool isVol = cast(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, false, - I.getOperand(1), 0, I.getOperand(2), 0)); + I.getArgOperand(0), 0, I.getArgOperand(1), 0)); return 0; } case Intrinsic::memset: { // Assert for address < 256 since we support only user defined address // spaces. - assert(cast(I.getOperand(1)->getType())->getAddressSpace() + assert(cast(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && "Unknown address space"); - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); - SDValue Op3 = getValue(I.getOperand(3)); - unsigned Align = cast(I.getOperand(4))->getZExtValue(); - bool isVol = cast(I.getOperand(5))->getZExtValue(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + unsigned Align = cast(I.getArgOperand(3))->getZExtValue(); + bool isVol = cast(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align, isVol, - I.getOperand(1), 0)); + I.getArgOperand(0), 0)); return 0; } case Intrinsic::memmove: { // Assert for address < 256 since we support only user defined address // spaces. 
- assert(cast(I.getOperand(1)->getType())->getAddressSpace() + assert(cast(I.getArgOperand(0)->getType())->getAddressSpace() < 256 && - cast(I.getOperand(2)->getType())->getAddressSpace() + cast(I.getArgOperand(1)->getType())->getAddressSpace() < 256 && "Unknown address space"); - SDValue Op1 = getValue(I.getOperand(1)); - SDValue Op2 = getValue(I.getOperand(2)); - SDValue Op3 = getValue(I.getOperand(3)); - unsigned Align = cast(I.getOperand(4))->getZExtValue(); - bool isVol = cast(I.getOperand(5))->getZExtValue(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDValue Op3 = getValue(I.getArgOperand(2)); + unsigned Align = cast(I.getArgOperand(3))->getZExtValue(); + bool isVol = cast(I.getArgOperand(4))->getZExtValue(); // If the source and destination are known to not be aliases, we can // lower memmove as memcpy. uint64_t Size = -1ULL; if (ConstantSDNode *C = dyn_cast(Op3)) Size = C->getZExtValue(); - if (AA->alias(I.getOperand(1), Size, I.getOperand(2), Size) == + if (AA->alias(I.getArgOperand(0), Size, I.getArgOperand(1), Size) == AliasAnalysis::NoAlias) { DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, - false, I.getOperand(1), 0, I.getOperand(2), 0)); + false, I.getArgOperand(0), 0, I.getArgOperand(1), 0)); return 0; } DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align, isVol, - I.getOperand(1), 0, I.getOperand(2), 0)); + I.getArgOperand(0), 0, I.getArgOperand(1), 0)); return 0; } case Intrinsic::dbg_declare: { @@ -3908,7 +4049,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } else { bool createUndef = false; // FIXME : Why not use getValue() directly ? - SDValue &N = NodeMap[V]; + SDValue N = NodeMap[V]; + if (!N.getNode() && isa(V)) + // Check unused arguments map. + N = UnusedArgNodeMap[V]; if (N.getNode()) { if (!EmitFuncArgumentDbgValue(DI, V, Variable, Offset, N)) { SDV = DAG.getDbgValue(Variable, N.getNode(), @@ -3956,7 +4100,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_exception: { // Insert the EXCEPTIONADDR instruction. - assert(FuncInfo.MBBMap[I.getParent()]->isLandingPad() && + assert(FuncInfo.MBB->isLandingPad() && "Call to eh.exception not in landing pad!"); SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other); SDValue Ops[1]; @@ -3968,7 +4112,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_selector: { - MachineBasicBlock *CallMBB = FuncInfo.MBBMap[I.getParent()]; + MachineBasicBlock *CallMBB = FuncInfo.MBB; MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); if (CallMBB->isLandingPad()) AddCatchInfo(I, &MMI, CallMBB); @@ -3978,13 +4122,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { #endif // FIXME: Mark exception selector register as live in. Hack for PR1508. unsigned Reg = TLI.getExceptionSelectorRegister(); - if (Reg) FuncInfo.MBBMap[I.getParent()]->addLiveIn(Reg); + if (Reg) FuncInfo.MBB->addLiveIn(Reg); } // Insert the EHSELECTION instruction. 
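The memmove hunk above asks alias analysis whether the source and destination can overlap and, if they provably cannot, emits the cheaper memcpy instead. The same idea for ordinary buffers, with the no-alias query approximated by a disjoint-range check on raw addresses (a simplification, not what AliasAnalysis actually does):

#include <cstdint>
#include <cstring>

// memcpy may be used only when the ranges cannot overlap; memmove is the
// safe fallback for everything else.
void copyPreferMemcpy(void *Dst, const void *Src, size_t Size) {
  uintptr_t D = reinterpret_cast<uintptr_t>(Dst);
  uintptr_t S = reinterpret_cast<uintptr_t>(Src);
  bool NoAlias = (D + Size <= S) || (S + Size <= D);
  if (NoAlias)
    std::memcpy(Dst, Src, Size);   // cheaper: no overlap handling needed
  else
    std::memmove(Dst, Src, Size);  // overlapping ranges must use memmove
}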
SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other); SDValue Ops[2]; - Ops[0] = getValue(I.getOperand(1)); + Ops[0] = getValue(I.getArgOperand(0)); Ops[1] = getRoot(); SDValue Op = DAG.getNode(ISD::EHSELECTION, dl, VTs, Ops, 2); DAG.setRoot(Op.getValue(1)); @@ -3994,7 +4138,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::eh_typeid_for: { // Find the type id for the given typeinfo. - GlobalVariable *GV = ExtractTypeInfo(I.getOperand(1)); + GlobalVariable *GV = ExtractTypeInfo(I.getArgOperand(0)); unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV); Res = DAG.getConstant(TypeID, MVT::i32); setValue(&I, Res); @@ -4007,15 +4151,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getNode(ISD::EH_RETURN, dl, MVT::Other, getControlRoot(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2)))); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); return 0; case Intrinsic::eh_unwind_init: DAG.getMachineFunction().getMMI().setCallsUnwindInit(true); return 0; case Intrinsic::eh_dwarf_cfa: { - EVT VT = getValue(I.getOperand(1)).getValueType(); - SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), dl, + SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), dl, TLI.getPointerTy()); SDValue Offset = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), @@ -4031,7 +4174,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_sjlj_callsite: { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); - ConstantInt *CI = dyn_cast(I.getOperand(1)); + ConstantInt *CI = dyn_cast(I.getArgOperand(0)); assert(CI && "Non-constant call site value in eh.sjlj.callsite!"); assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!"); @@ -4040,13 +4183,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::eh_sjlj_setjmp: { setValue(&I, DAG.getNode(ISD::EH_SJLJ_SETJMP, dl, MVT::i32, getRoot(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; } case Intrinsic::eh_sjlj_longjmp: { DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, dl, MVT::Other, getRoot(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)))); return 0; } @@ -4072,34 +4215,34 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::convertuu: Code = ISD::CVT_UU; break; } EVT DestVT = TLI.getValueType(I.getType()); - const Value *Op1 = I.getOperand(1); + const Value *Op1 = I.getArgOperand(0); Res = DAG.getConvertRndSat(DestVT, getCurDebugLoc(), getValue(Op1), DAG.getValueType(DestVT), DAG.getValueType(getValue(Op1).getValueType()), - getValue(I.getOperand(2)), - getValue(I.getOperand(3)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Code); setValue(&I, Res); return 0; } case Intrinsic::sqrt: setValue(&I, DAG.getNode(ISD::FSQRT, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::powi: - setValue(&I, ExpandPowI(dl, getValue(I.getOperand(1)), - getValue(I.getOperand(2)), DAG)); + setValue(&I, ExpandPowI(dl, getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), DAG)); return 0; case Intrinsic::sin: setValue(&I, DAG.getNode(ISD::FSIN, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + 
getValue(I.getArgOperand(0)))); return 0; case Intrinsic::cos: setValue(&I, DAG.getNode(ISD::FCOS, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::log: visitLog(I); @@ -4121,14 +4264,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl, - MVT::i16, getValue(I.getOperand(1)))); + MVT::i16, getValue(I.getArgOperand(0)))); return 0; case Intrinsic::convert_from_fp16: setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, dl, - MVT::f32, getValue(I.getOperand(1)))); + MVT::f32, getValue(I.getArgOperand(0)))); return 0; case Intrinsic::pcmarker: { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, dl, MVT::Other, getRoot(), Tmp)); return 0; } @@ -4143,23 +4286,23 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, dl, - getValue(I.getOperand(1)).getValueType(), - getValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); return 0; case Intrinsic::cttz: { - SDValue Arg = getValue(I.getOperand(1)); + SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTTZ, dl, Ty, Arg)); return 0; } case Intrinsic::ctlz: { - SDValue Arg = getValue(I.getOperand(1)); + SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTLZ, dl, Ty, Arg)); return 0; } case Intrinsic::ctpop: { - SDValue Arg = getValue(I.getOperand(1)); + SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTPOP, dl, Ty, Arg)); return 0; @@ -4173,7 +4316,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } case Intrinsic::stackrestore: { - Res = getValue(I.getOperand(1)); + Res = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, dl, MVT::Other, getRoot(), Res)); return 0; } @@ -4183,8 +4326,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { MachineFrameInfo *MFI = MF.getFrameInfo(); EVT PtrTy = TLI.getPointerTy(); - SDValue Src = getValue(I.getOperand(1)); // The guard's value. - AllocaInst *Slot = cast(I.getOperand(2)); + SDValue Src = getValue(I.getArgOperand(0)); // The guard's value. + AllocaInst *Slot = cast(I.getArgOperand(1)); int FI = FuncInfo.StaticAllocaMap[Slot]; MFI->setStackProtectorIndex(FI); @@ -4201,14 +4344,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::objectsize: { // If we don't know by now, we're never going to know. 
- ConstantInt *CI = dyn_cast(I.getOperand(2)); + ConstantInt *CI = dyn_cast(I.getArgOperand(1)); assert(CI && "Non-constant type in __builtin_object_size?"); - SDValue Arg = getValue(I.getOperand(0)); + SDValue Arg = getValue(I.getCalledValue()); EVT Ty = Arg.getValueType(); - if (CI->getZExtValue() == 0) + if (CI->isZero()) Res = DAG.getConstant(-1ULL, Ty); else Res = DAG.getConstant(0, Ty); @@ -4221,14 +4364,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; case Intrinsic::init_trampoline: { - const Function *F = cast(I.getOperand(2)->stripPointerCasts()); + const Function *F = cast(I.getArgOperand(1)->stripPointerCasts()); SDValue Ops[6]; Ops[0] = getRoot(); - Ops[1] = getValue(I.getOperand(1)); - Ops[2] = getValue(I.getOperand(2)); - Ops[3] = getValue(I.getOperand(3)); - Ops[4] = DAG.getSrcValue(I.getOperand(1)); + Ops[1] = getValue(I.getArgOperand(0)); + Ops[2] = getValue(I.getArgOperand(1)); + Ops[3] = getValue(I.getArgOperand(2)); + Ops[4] = DAG.getSrcValue(I.getArgOperand(0)); Ops[5] = DAG.getSrcValue(F); Res = DAG.getNode(ISD::TRAMPOLINE, dl, @@ -4241,8 +4384,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::gcroot: if (GFI) { - const Value *Alloca = I.getOperand(1); - const Constant *TypeMap = cast(I.getOperand(2)); + const Value *Alloca = I.getArgOperand(0); + const Constant *TypeMap = cast(I.getArgOperand(1)); FrameIndexSDNode *FI = cast(getValue(Alloca).getNode()); GFI->addStackRoot(FI->getIndex(), TypeMap); @@ -4274,9 +4417,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::prefetch: { SDValue Ops[4]; Ops[0] = getRoot(); - Ops[1] = getValue(I.getOperand(1)); - Ops[2] = getValue(I.getOperand(2)); - Ops[3] = getValue(I.getOperand(3)); + Ops[1] = getValue(I.getArgOperand(0)); + Ops[2] = getValue(I.getArgOperand(1)); + Ops[3] = getValue(I.getArgOperand(2)); DAG.setRoot(DAG.getNode(ISD::PREFETCH, dl, MVT::Other, &Ops[0], 4)); return 0; } @@ -4285,7 +4428,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Ops[6]; Ops[0] = getRoot(); for (int x = 1; x < 6; ++x) - Ops[x] = getValue(I.getOperand(x)); + Ops[x] = getValue(I.getArgOperand(x - 1)); DAG.setRoot(DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, &Ops[0], 6)); return 0; @@ -4294,12 +4437,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Root = getRoot(); SDValue L = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, getCurDebugLoc(), - getValue(I.getOperand(2)).getValueType().getSimpleVT(), + getValue(I.getArgOperand(1)).getValueType().getSimpleVT(), Root, - getValue(I.getOperand(1)), - getValue(I.getOperand(2)), - getValue(I.getOperand(3)), - I.getOperand(1)); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), + I.getArgOperand(0)); setValue(&I, L); DAG.setRoot(L.getValue(1)); return 0; @@ -4353,14 +4496,13 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Args.reserve(CS.arg_size()); // Check whether the function can return without sret-demotion. 
- SmallVector OutVTs; - SmallVector OutsFlags; + SmallVector Outs; SmallVector Offsets; - getReturnInfo(RetTy, CS.getAttributes().getRetAttributes(), - OutVTs, OutsFlags, TLI, &Offsets); + GetReturnInfo(RetTy, CS.getAttributes().getRetAttributes(), + Outs, TLI, &Offsets); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - FTy->isVarArg(), OutVTs, OutsFlags, DAG); + FTy->isVarArg(), Outs, FTy->getContext()); SDValue DemoteStackSlot; @@ -4453,7 +4595,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, ComputeValueVTs(TLI, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); EVT PtrVT = PVTs[0]; - unsigned NumValues = OutVTs.size(); + unsigned NumValues = Outs.size(); SmallVector Values(NumValues); SmallVector Chains(NumValues); @@ -4461,7 +4603,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, DemoteStackSlot, DAG.getConstant(Offsets[i], PtrVT)); - SDValue L = DAG.getLoad(OutVTs[i], getCurDebugLoc(), Result.second, + SDValue L = DAG.getLoad(Outs[i].VT, getCurDebugLoc(), Result.second, Add, NULL, Offsets[i], false, false, 1); Values[i] = L; Chains[i] = L.getValue(1); @@ -4580,16 +4722,16 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, /// lowered like a normal call. bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { // Verify that the prototype makes sense. int memcmp(void*,void*,size_t) - if (I.getNumOperands() != 4) + if (I.getNumArgOperands() != 3) return false; - const Value *LHS = I.getOperand(1), *RHS = I.getOperand(2); + const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1); if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() || - !I.getOperand(3)->getType()->isIntegerTy() || + !I.getArgOperand(2)->getType()->isIntegerTy() || !I.getType()->isIntegerTy()) return false; - const ConstantInt *Size = dyn_cast(I.getOperand(3)); + const ConstantInt *Size = dyn_cast(I.getArgOperand(2)); // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0 // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0 @@ -4656,11 +4798,16 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { void SelectionDAGBuilder::visitCall(const CallInst &I) { + // Handle inline assembly differently. + if (isa(I.getCalledValue())) { + visitInlineAsm(&I); + return; + } + const char *RenameFn = 0; if (Function *F = I.getCalledFunction()) { if (F->isDeclaration()) { - const TargetIntrinsicInfo *II = TM.getIntrinsicInfo(); - if (II) { + if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) { if (unsigned IID = II->getIntrinsicID(F)) { RenameFn = visitIntrinsicCall(I, IID); if (!RenameFn) @@ -4679,51 +4826,51 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (!F->hasLocalLinkage() && F->hasName()) { StringRef Name = F->getName(); if (Name == "copysign" || Name == "copysignf" || Name == "copysignl") { - if (I.getNumOperands() == 3 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && - I.getType() == I.getOperand(2)->getType()) { - SDValue LHS = getValue(I.getOperand(1)); - SDValue RHS = getValue(I.getOperand(2)); + if (I.getNumArgOperands() == 2 && // Basic sanity checks. 
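visitMemCmpCall above recognizes calls matching int memcmp(void*, void*, size_t) with a small constant size and, per the comments, rewrites the compared-against-zero form into integer loads and a single comparison. A portable sketch of the size-4 case; memcpy into a local stands in for the wide load so the example stays well-defined C++ rather than the type-punning cast written in the comment:

#include <cstdint>
#include <cstring>

// Equivalent of "memcmp(LHS, RHS, 4) != 0" lowered to one 32-bit compare.
bool memcmp4NotEqual(const void *LHS, const void *RHS) {
  uint32_t A, B;
  std::memcpy(&A, LHS, sizeof(A)); // stands in for the 32-bit load of *LHS
  std::memcpy(&B, RHS, sizeof(B)); // stands in for the 32-bit load of *RHS
  return A != B;
}

The rewrite shown in the comments applies when the result is only tested against zero; an ordering comparison cannot be reduced to a single integer inequality like this.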
+ I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && + I.getType() == I.getArgOperand(1)->getType()) { + SDValue LHS = getValue(I.getArgOperand(0)); + SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(), LHS.getValueType(), LHS, RHS)); return; } } else if (Name == "fabs" || Name == "fabsf" || Name == "fabsl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType()) { - SDValue Tmp = getValue(I.getOperand(1)); + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType()) { + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FABS, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; } } else if (Name == "sin" || Name == "sinf" || Name == "sinl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FSIN, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; } } else if (Name == "cos" || Name == "cosf" || Name == "cosl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FCOS, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; } } else if (Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl") { - if (I.getNumOperands() == 2 && // Basic sanity checks. - I.getOperand(1)->getType()->isFloatingPointTy() && - I.getType() == I.getOperand(1)->getType() && + if (I.getNumArgOperands() == 1 && // Basic sanity checks. + I.getArgOperand(0)->getType()->isFloatingPointTy() && + I.getType() == I.getArgOperand(0)->getType() && I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getOperand(1)); + SDValue Tmp = getValue(I.getArgOperand(0)); setValue(&I, DAG.getNode(ISD::FSQRT, getCurDebugLoc(), Tmp.getValueType(), Tmp)); return; @@ -4733,14 +4880,11 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { return; } } - } else if (isa(I.getOperand(0))) { - visitInlineAsm(&I); - return; } - + SDValue Callee; if (!RenameFn) - Callee = getValue(I.getOperand(0)); + Callee = getValue(I.getCalledValue()); else Callee = DAG.getExternalSymbol(RenameFn, TLI.getPointerTy()); @@ -4749,210 +4893,8 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { LowerCallTo(&I, Callee, I.isTailCall()); } -/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from -/// this value and returns the result as a ValueVT value. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. 
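The library-call fast paths above still map fabs, sin, cos and sqrt (and their f/l variants) onto single DAG nodes once the argument count, the floating-point types and, for the latter three, I.onlyReadsMemory() have been checked, now phrased via getArgOperand. A compact sketch of that kind of recognizer; the CallDesc struct and the enum are illustrative, not the real interfaces, and copysign follows the same pattern with two arguments:

#include <string>

enum class LibmOp { None, FSin, FCos, FSqrt, FAbs };

struct CallDesc {
  std::string CalleeName;
  unsigned NumArgs;
  bool ArgAndResultAreSameFPType; // models the type checks in the hunk
  bool OnlyReadsMemory;           // models I.onlyReadsMemory()
};

// Decide whether a call may be emitted as a single floating-point DAG node.
LibmOp classifyLibmCall(const CallDesc &C) {
  if (C.NumArgs != 1 || !C.ArgAndResultAreSameFPType)
    return LibmOp::None;
  if (C.CalleeName == "fabs" || C.CalleeName == "fabsf" || C.CalleeName == "fabsl")
    return LibmOp::FAbs;          // fabs is accepted without the purity check
  if (!C.OnlyReadsMemory)
    return LibmOp::None;          // mirrors the onlyReadsMemory() requirement
  if (C.CalleeName == "sin" || C.CalleeName == "sinf" || C.CalleeName == "sinl")
    return LibmOp::FSin;
  if (C.CalleeName == "cos" || C.CalleeName == "cosf" || C.CalleeName == "cosl")
    return LibmOp::FCos;
  if (C.CalleeName == "sqrt" || C.CalleeName == "sqrtf" || C.CalleeName == "sqrtl")
    return LibmOp::FSqrt;
  return LibmOp::None;
}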
-SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, DebugLoc dl, - SDValue &Chain, SDValue *Flag) const { - // Assemble the legal parts into the final values. - SmallVector Values(ValueVTs.size()); - SmallVector Parts; - for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { - // Copy the legal parts from the registers. - EVT ValueVT = ValueVTs[Value]; - unsigned NumRegs = TLI->getNumRegisters(*DAG.getContext(), ValueVT); - EVT RegisterVT = RegVTs[Value]; - - Parts.resize(NumRegs); - for (unsigned i = 0; i != NumRegs; ++i) { - SDValue P; - if (Flag == 0) { - P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); - } else { - P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); - *Flag = P.getValue(2); - } - - Chain = P.getValue(1); - - // If the source register was virtual and if we know something about it, - // add an assert node. - if (TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) && - RegisterVT.isInteger() && !RegisterVT.isVector()) { - unsigned SlotNo = Regs[Part+i]-TargetRegisterInfo::FirstVirtualRegister; - FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo(); - if (FLI.LiveOutRegInfo.size() > SlotNo) { - FunctionLoweringInfo::LiveOutInfo &LOI = FLI.LiveOutRegInfo[SlotNo]; - - unsigned RegSize = RegisterVT.getSizeInBits(); - unsigned NumSignBits = LOI.NumSignBits; - unsigned NumZeroBits = LOI.KnownZero.countLeadingOnes(); - - // FIXME: We capture more information than the dag can represent. For - // now, just use the tightest assertzext/assertsext possible. - bool isSExt = true; - EVT FromVT(MVT::Other); - if (NumSignBits == RegSize) - isSExt = true, FromVT = MVT::i1; // ASSERT SEXT 1 - else if (NumZeroBits >= RegSize-1) - isSExt = false, FromVT = MVT::i1; // ASSERT ZEXT 1 - else if (NumSignBits > RegSize-8) - isSExt = true, FromVT = MVT::i8; // ASSERT SEXT 8 - else if (NumZeroBits >= RegSize-8) - isSExt = false, FromVT = MVT::i8; // ASSERT ZEXT 8 - else if (NumSignBits > RegSize-16) - isSExt = true, FromVT = MVT::i16; // ASSERT SEXT 16 - else if (NumZeroBits >= RegSize-16) - isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16 - else if (NumSignBits > RegSize-32) - isSExt = true, FromVT = MVT::i32; // ASSERT SEXT 32 - else if (NumZeroBits >= RegSize-32) - isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32 - - if (FromVT != MVT::Other) - P = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl, - RegisterVT, P, DAG.getValueType(FromVT)); - } - } - - Parts[i] = P; - } - - Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), - NumRegs, RegisterVT, ValueVT); - Part += NumRegs; - Parts.clear(); - } - - return DAG.getNode(ISD::MERGE_VALUES, dl, - DAG.getVTList(&ValueVTs[0], ValueVTs.size()), - &Values[0], ValueVTs.size()); -} - -/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the -/// specified value into the registers specified by this object. This uses -/// Chain/Flag as the input and updates them for the output Chain/Flag. -/// If the Flag pointer is NULL, no flag is used. -void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl, - SDValue &Chain, SDValue *Flag) const { - // Get the list of the values's legal parts. 
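The RegsForValue::getCopyFromRegs body removed above (its callers now pass FuncInfo in, so the logic lives elsewhere after this patch) picked the tightest AssertSext or AssertZext it could justify from the live-out sign- and zero-bit counts of a virtual register. That cascade reads naturally as a small pure function; the sketch below mirrors the deleted branches, with the return type and names being mine:

#include <utility>

enum class FromWidth { None, I1, I8, I16, I32 };

// Given the register width and how many sign/zero bits are known to be
// correct, return whether a sign- or zero-extension assertion applies and
// from which source width. Mirrors the if/else chain in getCopyFromRegs.
std::pair<bool /*isSExt*/, FromWidth>
tightestAssert(unsigned RegSize, unsigned NumSignBits, unsigned NumZeroBits) {
  if (NumSignBits == RegSize)      return {true,  FromWidth::I1};  // ASSERT SEXT 1
  if (NumZeroBits >= RegSize - 1)  return {false, FromWidth::I1};  // ASSERT ZEXT 1
  if (NumSignBits > RegSize - 8)   return {true,  FromWidth::I8};
  if (NumZeroBits >= RegSize - 8)  return {false, FromWidth::I8};
  if (NumSignBits > RegSize - 16)  return {true,  FromWidth::I16};
  if (NumZeroBits >= RegSize - 16) return {false, FromWidth::I16};
  if (NumSignBits > RegSize - 32)  return {true,  FromWidth::I32};
  if (NumZeroBits >= RegSize - 32) return {false, FromWidth::I32};
  return {false, FromWidth::None}; // nothing tight enough is known
}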
- unsigned NumRegs = Regs.size(); - SmallVector Parts(NumRegs); - for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; - unsigned NumParts = TLI->getNumRegisters(*DAG.getContext(), ValueVT); - EVT RegisterVT = RegVTs[Value]; - - getCopyToParts(DAG, dl, - Val.getValue(Val.getResNo() + Value), - &Parts[Part], NumParts, RegisterVT); - Part += NumParts; - } - - // Copy the parts into the registers. - SmallVector Chains(NumRegs); - for (unsigned i = 0; i != NumRegs; ++i) { - SDValue Part; - if (Flag == 0) { - Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); - } else { - Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); - *Flag = Part.getValue(1); - } - - Chains[i] = Part.getValue(0); - } - - if (NumRegs == 1 || Flag) - // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is - // flagged to it. That is the CopyToReg nodes and the user are considered - // a single scheduling unit. If we create a TokenFactor and return it as - // chain, then the TokenFactor is both a predecessor (operand) of the - // user as well as a successor (the TF operands are flagged to the user). - // c1, f1 = CopyToReg - // c2, f2 = CopyToReg - // c3 = TokenFactor c1, c2 - // ... - // = op c3, ..., f2 - Chain = Chains[NumRegs-1]; - else - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs); -} - -/// AddInlineAsmOperands - Add this value to the specified inlineasm node -/// operand list. This adds the code marker and includes the number of -/// values added into it. -void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, - unsigned MatchingIdx, - SelectionDAG &DAG, - std::vector &Ops) const { - unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size()); - if (HasMatching) - Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx); - SDValue Res = DAG.getTargetConstant(Flag, MVT::i32); - Ops.push_back(Res); - - for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) { - unsigned NumRegs = TLI->getNumRegisters(*DAG.getContext(), ValueVTs[Value]); - EVT RegisterVT = RegVTs[Value]; - for (unsigned i = 0; i != NumRegs; ++i) { - assert(Reg < Regs.size() && "Mismatch in # registers expected"); - Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT)); - } - } -} - -/// isAllocatableRegister - If the specified register is safe to allocate, -/// i.e. it isn't a stack pointer or some other special register, return the -/// register class for the register. Otherwise, return null. -static const TargetRegisterClass * -isAllocatableRegister(unsigned Reg, MachineFunction &MF, - const TargetLowering &TLI, - const TargetRegisterInfo *TRI) { - EVT FoundVT = MVT::Other; - const TargetRegisterClass *FoundRC = 0; - for (TargetRegisterInfo::regclass_iterator RCI = TRI->regclass_begin(), - E = TRI->regclass_end(); RCI != E; ++RCI) { - EVT ThisVT = MVT::Other; - - const TargetRegisterClass *RC = *RCI; - // If none of the value types for this register class are valid, we - // can't use it. For example, 64-bit reg classes on 32-bit targets. - for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); - I != E; ++I) { - if (TLI.isTypeLegal(*I)) { - // If we have already found this register in a different register class, - // choose the one with the largest VT specified. For example, on - // PowerPC, we favor f64 register classes over f32. 
- if (FoundVT == MVT::Other || FoundVT.bitsLT(*I)) { - ThisVT = *I; - break; - } - } - } - - if (ThisVT == MVT::Other) continue; - - // NOTE: This isn't ideal. In particular, this might allocate the - // frame pointer in functions that need it (due to them not being taken - // out of allocation, because a variable sized allocation hasn't been seen - // yet). This is a slight code pessimization, but should still work. - for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), - E = RC->allocation_order_end(MF); I != E; ++I) - if (*I == Reg) { - // We found a matching register class. Keep looking at others in case - // we find one with larger registers that this physreg is also in. - FoundRC = RC; - FoundVT = ThisVT; - break; - } - } - return FoundRC; -} - - namespace llvm { + /// AsmOperandInfo - This contains information for each constraint that we are /// lowering. class LLVM_LIBRARY_VISIBILITY SDISelAsmOperandInfo : @@ -5041,8 +4983,56 @@ private: Regs.insert(*Aliases); } }; + } // end llvm namespace. +/// isAllocatableRegister - If the specified register is safe to allocate, +/// i.e. it isn't a stack pointer or some other special register, return the +/// register class for the register. Otherwise, return null. +static const TargetRegisterClass * +isAllocatableRegister(unsigned Reg, MachineFunction &MF, + const TargetLowering &TLI, + const TargetRegisterInfo *TRI) { + EVT FoundVT = MVT::Other; + const TargetRegisterClass *FoundRC = 0; + for (TargetRegisterInfo::regclass_iterator RCI = TRI->regclass_begin(), + E = TRI->regclass_end(); RCI != E; ++RCI) { + EVT ThisVT = MVT::Other; + + const TargetRegisterClass *RC = *RCI; + // If none of the value types for this register class are valid, we + // can't use it. For example, 64-bit reg classes on 32-bit targets. + for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); + I != E; ++I) { + if (TLI.isTypeLegal(*I)) { + // If we have already found this register in a different register class, + // choose the one with the largest VT specified. For example, on + // PowerPC, we favor f64 register classes over f32. + if (FoundVT == MVT::Other || FoundVT.bitsLT(*I)) { + ThisVT = *I; + break; + } + } + } + + if (ThisVT == MVT::Other) continue; + + // NOTE: This isn't ideal. In particular, this might allocate the + // frame pointer in functions that need it (due to them not being taken + // out of allocation, because a variable sized allocation hasn't been seen + // yet). This is a slight code pessimization, but should still work. + for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), + E = RC->allocation_order_end(MF); I != E; ++I) + if (*I == Reg) { + // We found a matching register class. Keep looking at others in case + // we find one with larger registers that this physreg is also in. + FoundRC = RC; + FoundVT = ThisVT; + break; + } + } + return FoundRC; +} /// GetRegistersForValue - Assign registers (virtual or physical) for the /// specified operand. 
We prefer to assign virtual registers, to allow the @@ -5154,7 +5144,7 @@ GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, } } - OpInfo.AssignedRegs = RegsForValue(TLI, Regs, RegVT, ValueVT); + OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI); return; @@ -5172,7 +5162,7 @@ GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, for (; NumRegs; --NumRegs) Regs.push_back(RegInfo.createVirtualRegister(RC)); - OpInfo.AssignedRegs = RegsForValue(TLI, Regs, RegVT, ValueVT); + OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT); return; } @@ -5215,7 +5205,7 @@ GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, for (unsigned i = RegStart; i != RegEnd; ++i) Regs.push_back(RegClassRegs[i]); - OpInfo.AssignedRegs = RegsForValue(TLI, Regs, *RC->vt_begin(), + OpInfo.AssignedRegs = RegsForValue(Regs, *RC->vt_begin(), OpInfo.ConstraintVT); OpInfo.MarkAllocatedRegs(isOutReg, isInReg, OutputRegs, InputRegs, *TRI); return; @@ -5332,7 +5322,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // Compute the constraint code and ConstraintType to use. - TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, hasMemory, &DAG); + TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); // If this is a memory input, and if the operand is not indirect, do what we // need to to provide an address for the memory input. @@ -5406,6 +5396,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc"); AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc)); + // Remember the AlignStack bit as operand 3. + AsmNodeOperands.push_back(DAG.getTargetConstant(IA->isAlignStack() ? 1 : 0, + MVT::i1)); + // Loop over all of the inputs, copying the operand values into the // appropriate registers and processing the output regs. RegsForValue RetValRegs; @@ -5497,7 +5491,6 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } RegsForValue MatchedRegs; - MatchedRegs.TLI = &TLI; MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType()); EVT RegVT = AsmNodeOperands[CurOp+1].getValueType(); MatchedRegs.RegVTs.push_back(RegVT); @@ -5535,7 +5528,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { std::vector Ops; TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode[0], - hasMemory, Ops, DAG); + Ops, DAG); if (Ops.empty()) report_fatal_error("Invalid operand for inline asm constraint '" + Twine(OpInfo.ConstraintCode) + "'!"); @@ -5570,7 +5563,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // Copy the input into the appropriate registers. if (OpInfo.AssignedRegs.Regs.empty() || - !OpInfo.AssignedRegs.areValueTypesLegal()) + !OpInfo.AssignedRegs.areValueTypesLegal(TLI)) report_fatal_error("Couldn't allocate input reg for constraint '" + Twine(OpInfo.ConstraintCode) + "'!"); @@ -5595,7 +5588,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // Finish up input operands. Set the input chain and add the flag last. - AsmNodeOperands[0] = Chain; + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; if (Flag.getNode()) AsmNodeOperands.push_back(Flag); Chain = DAG.getNode(ISD::INLINEASM, getCurDebugLoc(), @@ -5606,7 +5599,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // If this asm returns a register value, copy the result from that register // and set it as the value of the call. 
if (!RetValRegs.Regs.empty()) { - SDValue Val = RetValRegs.getCopyFromRegs(DAG, getCurDebugLoc(), + SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, &Flag); // FIXME: Why don't we do this for inline asms with MRVs? @@ -5646,7 +5639,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) { RegsForValue &OutRegs = IndirectStoresToEmit[i].first; const Value *Ptr = IndirectStoresToEmit[i].second; - SDValue OutVal = OutRegs.getCopyFromRegs(DAG, getCurDebugLoc(), + SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, &Flag); StoresToEmit.push_back(std::make_pair(OutVal, Ptr)); } @@ -5672,14 +5665,16 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { void SelectionDAGBuilder::visitVAStart(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VASTART, getCurDebugLoc(), MVT::Other, getRoot(), - getValue(I.getOperand(1)), - DAG.getSrcValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)), + DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { + const TargetData &TD = *TLI.getTargetData(); SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(), getRoot(), getValue(I.getOperand(0)), - DAG.getSrcValue(I.getOperand(0))); + DAG.getSrcValue(I.getOperand(0)), + TD.getABITypeAlignment(I.getType())); setValue(&I, V); DAG.setRoot(V.getValue(1)); } @@ -5687,17 +5682,17 @@ void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { void SelectionDAGBuilder::visitVAEnd(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VAEND, getCurDebugLoc(), MVT::Other, getRoot(), - getValue(I.getOperand(1)), - DAG.getSrcValue(I.getOperand(1)))); + getValue(I.getArgOperand(0)), + DAG.getSrcValue(I.getArgOperand(0)))); } void SelectionDAGBuilder::visitVACopy(const CallInst &I) { DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurDebugLoc(), MVT::Other, getRoot(), - getValue(I.getOperand(1)), - getValue(I.getOperand(2)), - DAG.getSrcValue(I.getOperand(1)), - DAG.getSrcValue(I.getOperand(2)))); + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + DAG.getSrcValue(I.getArgOperand(0)), + DAG.getSrcValue(I.getArgOperand(1)))); } /// TargetLowering::LowerCallTo - This is the default LowerCallTo @@ -5715,6 +5710,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, DebugLoc dl) const { // Handle all of the outgoing arguments. SmallVector Outs; + SmallVector OutVals; for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector ValueVTs; ComputeValueVTs(*this, Args[i].Ty, ValueVTs); @@ -5768,13 +5764,15 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 - ISD::OutputArg MyFlags(Flags, Parts[j], i < NumFixedArgs); + ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), + i < NumFixedArgs); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); else if (j != 0) MyFlags.Flags.setOrigAlign(1); Outs.push_back(MyFlags); + OutVals.push_back(Parts[j]); } } } @@ -5803,7 +5801,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy, SmallVector InVals; Chain = LowerCall(Chain, Callee, CallConv, isVarArg, isTailCall, - Outs, Ins, dl, DAG, InVals); + Outs, OutVals, Ins, dl, DAG, InVals); // Verify that the target's LowerCall behaved as expected. 
assert(Chain.getNode() && Chain.getValueType() == MVT::Other && @@ -5876,7 +5874,7 @@ SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { - SDValue Op = getValue(V); + SDValue Op = getNonRegisterValue(V); assert((Op.getOpcode() != ISD::CopyFromReg || cast(Op.getOperand(1))->getReg() != Reg) && "Copy from a reg to the same reg!"); @@ -5894,21 +5892,16 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // If this is the entry block, emit arguments. const Function &F = *LLVMBB->getParent(); SelectionDAG &DAG = SDB->DAG; - SDValue OldRoot = DAG.getRoot(); DebugLoc dl = SDB->getCurDebugLoc(); const TargetData *TD = TLI.getTargetData(); SmallVector Ins; // Check whether the function can return without sret-demotion. - SmallVector OutVTs; - SmallVector OutsFlags; - getReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), - OutVTs, OutsFlags, TLI); - FunctionLoweringInfo &FLI = DAG.getFunctionLoweringInfo(); - - FLI.CanLowerReturn = TLI.CanLowerReturn(F.getCallingConv(), F.isVarArg(), - OutVTs, OutsFlags, DAG); - if (!FLI.CanLowerReturn) { + SmallVector Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), + Outs, TLI); + + if (!FuncInfo->CanLowerReturn) { // Put in an sret pointer parameter before all the other parameters. SmallVector ValueVTs; ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs); @@ -6002,7 +5995,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // Set up the argument values. unsigned i = 0; Idx = 1; - if (!FLI.CanLowerReturn) { + if (!FuncInfo->CanLowerReturn) { // Create a virtual register for the sret pointer, and put in a copy // from the sret argument into it. SmallVector ValueVTs; @@ -6016,7 +6009,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); unsigned SRetReg = RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT)); - FLI.DemoteRegister = SRetReg; + FuncInfo->DemoteRegister = SRetReg; NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurDebugLoc(), SRetReg, ArgValue); DAG.setRoot(NewRoot); @@ -6032,6 +6025,12 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { SmallVector ValueVTs; ComputeValueVTs(TLI, I->getType(), ValueVTs); unsigned NumValues = ValueVTs.size(); + + // If this argument is unused then remember its value. It is used to generate + // debugging information. 
+ if (I->use_empty() && NumValues) + SDB->setUnusedArgValue(I, InVals[i]); + for (unsigned Value = 0; Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; EVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT); @@ -6112,17 +6111,20 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { if (const Constant *C = dyn_cast(PHIOp)) { unsigned &RegOut = ConstantsOut[C]; if (RegOut == 0) { - RegOut = FuncInfo.CreateRegForValue(C); + RegOut = FuncInfo.CreateRegs(C->getType()); CopyValueToVirtualRegister(C, RegOut); } Reg = RegOut; } else { - Reg = FuncInfo.ValueMap[PHIOp]; - if (Reg == 0) { + DenseMap::iterator I = + FuncInfo.ValueMap.find(PHIOp); + if (I != FuncInfo.ValueMap.end()) + Reg = I->second; + else { assert(isa(PHIOp) && FuncInfo.StaticAllocaMap.count(cast(PHIOp)) && "Didn't codegen value into a register!??"); - Reg = FuncInfo.CreateRegForValue(PHIOp); + Reg = FuncInfo.CreateRegs(PHIOp->getType()); CopyValueToVirtualRegister(PHIOp, Reg); } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 3fcd4b9dc437..46733d6db124 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -88,6 +88,10 @@ class SelectionDAGBuilder { DebugLoc CurDebugLoc; DenseMap NodeMap; + + /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used + /// to preserve debug information for incoming arguments. + DenseMap UnusedArgNodeMap; public: /// PendingLoads - Loads are not emitted to the program immediately. We bunch @@ -342,6 +346,8 @@ public: void visit(unsigned Opcode, const User &I); SDValue getValue(const Value *V); + SDValue getNonRegisterValue(const Value *V); + SDValue getValueImpl(const Value *V); void setValue(const Value *V, SDValue NewN) { SDValue &N = NodeMap[V]; @@ -349,6 +355,12 @@ public: N = NewN; } + void setUnusedArgValue(const Value *V, SDValue NewN) { + SDValue &N = UnusedArgNodeMap[V]; + assert(N.getNode() == 0 && "Already set a value for this node!"); + N = NewN; + } + void GetRegistersForValue(SDISelAsmOperandInfo &OpInfo, std::set &OutputRegs, std::set &InputRegs); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 65b8d4f65919..08ba5482f7d2 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -14,7 +14,7 @@ #define DEBUG_TYPE "isel" #include "ScheduleDAGSDNodes.h" #include "SelectionDAGBuilder.h" -#include "FunctionLoweringInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/DebugInfo.h" @@ -171,7 +171,7 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm, CodeGenOpt::Level OL) : MachineFunctionPass(&ID), TM(tm), TLI(*tm.getTargetLowering()), FuncInfo(new FunctionLoweringInfo(TLI)), - CurDAG(new SelectionDAG(tm, *FuncInfo)), + CurDAG(new SelectionDAG(tm)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), GFI(), OptLevel(OL), @@ -244,7 +244,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); CurDAG->init(*MF); - FuncInfo->set(Fn, *MF, EnableFastISel); + FuncInfo->set(Fn, *MF); SDB->init(GFI, *AA); SelectAllBasicBlocks(Fn); @@ -300,7 +300,11 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { for 
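The UnusedArgNodeMap declared in the header hunk above records the lowered SDValue of incoming arguments that have no other uses, so that the dbg_value lowering earlier in this patch can still find a location when NodeMap has no entry. The lookup order is simply the primary map first, then the argument fallback; a small sketch with generic standard-library maps and invented names:

#include <optional>
#include <string>
#include <unordered_map>

struct DebugOnlyLowering {
  std::unordered_map<std::string, int> NodeMap;          // normal lowering results
  std::unordered_map<std::string, int> UnusedArgNodeMap; // kept only for debug info

  // Consult the primary map; for arguments that were never otherwise used,
  // fall back to the secondary map filled in at argument-lowering time.
  std::optional<int> lookup(const std::string &V, bool IsArgument) const {
    if (auto It = NodeMap.find(V); It != NodeMap.end())
      return It->second;
    if (IsArgument)
      if (auto It = UnusedArgNodeMap.find(V); It != UnusedArgNodeMap.end())
        return It->second;
    return std::nullopt; // no location available for this value
  }
};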
(MachineBasicBlock::const_iterator II = MBB->begin(), IE = MBB->end(); II != IE; ++II) { const TargetInstrDesc &TID = TM.getInstrInfo()->get(II->getOpcode()); - if (II->isInlineAsm() || (TID.isCall() && !TID.isReturn())) { + + // Operand 1 of an inline asm instruction indicates whether the asm + // needs stack or not. + if ((II->isInlineAsm() && II->getOperand(1).getImm()) || + (TID.isCall() && !TID.isReturn())) { MFI->setHasCalls(true); goto done; } @@ -312,6 +316,26 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // Determine if there is a call to setjmp in the machine function. MF->setCallsSetJmp(FunctionCallsSetJmp(&Fn)); + // Replace forward-declared registers with the registers containing + // the desired value. + MachineRegisterInfo &MRI = MF->getRegInfo(); + for (DenseMap::iterator + I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end(); + I != E; ++I) { + unsigned From = I->first; + unsigned To = I->second; + // If To is also scheduled to be replaced, find what its ultimate + // replacement is. + for (;;) { + DenseMap::iterator J = + FuncInfo->RegFixups.find(To); + if (J == E) break; + To = J->second; + } + // Replace it. + MRI.replaceRegWith(From, To); + } + // Release function-specific state. SDB and CurDAG are already cleared // at this point. FuncInfo->clear(); @@ -319,10 +343,8 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { return true; } -MachineBasicBlock * -SelectionDAGISel::SelectBasicBlock(MachineBasicBlock *BB, - const BasicBlock *LLVMBB, - BasicBlock::const_iterator Begin, +void +SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin, BasicBlock::const_iterator End, bool &HadTailCall) { // Lower all of the non-terminator instructions. If a call is emitted @@ -337,7 +359,7 @@ SelectionDAGISel::SelectBasicBlock(MachineBasicBlock *BB, SDB->clear(); // Final step, emit the lowered DAG as machine code. - return CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } namespace { @@ -372,102 +394,6 @@ public: }; } -/// TrivialTruncElim - Eliminate some trivial nops that can result from -/// ShrinkDemandedOps: (trunc (ext n)) -> n. -static bool TrivialTruncElim(SDValue Op, - TargetLowering::TargetLoweringOpt &TLO) { - SDValue N0 = Op.getOperand(0); - EVT VT = Op.getValueType(); - if ((N0.getOpcode() == ISD::ZERO_EXTEND || - N0.getOpcode() == ISD::SIGN_EXTEND || - N0.getOpcode() == ISD::ANY_EXTEND) && - N0.getOperand(0).getValueType() == VT) { - return TLO.CombineTo(Op, N0.getOperand(0)); - } - return false; -} - -/// ShrinkDemandedOps - A late transformation pass that shrink expressions -/// using TargetLowering::TargetLoweringOpt::ShrinkDemandedOp. It converts -/// x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. -void SelectionDAGISel::ShrinkDemandedOps() { - SmallVector Worklist; - SmallPtrSet InWorklist; - - // Add all the dag nodes to the worklist. - Worklist.reserve(CurDAG->allnodes_size()); - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - Worklist.push_back(I); - InWorklist.insert(I); - } - - TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true, true); - while (!Worklist.empty()) { - SDNode *N = Worklist.pop_back_val(); - InWorklist.erase(N); - - if (N->use_empty() && N != CurDAG->getRoot().getNode()) { - // Deleting this node may make its operands dead, add them to the worklist - // if they aren't already there. 
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - if (InWorklist.insert(N->getOperand(i).getNode())) - Worklist.push_back(N->getOperand(i).getNode()); - - CurDAG->DeleteNode(N); - continue; - } - - // Run ShrinkDemandedOp on scalar binary operations. - if (N->getNumValues() != 1 || - !N->getValueType(0).isSimple() || !N->getValueType(0).isInteger()) - continue; - - unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); - APInt Demanded = APInt::getAllOnesValue(BitWidth); - APInt KnownZero, KnownOne; - if (!TLI.SimplifyDemandedBits(SDValue(N, 0), Demanded, - KnownZero, KnownOne, TLO) && - (N->getOpcode() != ISD::TRUNCATE || - !TrivialTruncElim(SDValue(N, 0), TLO))) - continue; - - // Revisit the node. - assert(!InWorklist.count(N) && "Already in worklist"); - Worklist.push_back(N); - InWorklist.insert(N); - - // Replace the old value with the new one. - DEBUG(errs() << "\nShrinkDemandedOps replacing "; - TLO.Old.getNode()->dump(CurDAG); - errs() << "\nWith: "; - TLO.New.getNode()->dump(CurDAG); - errs() << '\n'); - - if (InWorklist.insert(TLO.New.getNode())) - Worklist.push_back(TLO.New.getNode()); - - SDOPsWorkListRemover DeadNodes(Worklist, InWorklist); - CurDAG->ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes); - - if (!TLO.Old.getNode()->use_empty()) continue; - - for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); - i != e; ++i) { - SDNode *OpNode = TLO.Old.getNode()->getOperand(i).getNode(); - if (OpNode->hasOneUse()) { - // Add OpNode to the end of the list to revisit. - DeadNodes.RemoveFromWorklist(OpNode); - Worklist.push_back(OpNode); - InWorklist.insert(OpNode); - } - } - - DeadNodes.RemoveFromWorklist(TLO.Old.getNode()); - CurDAG->DeleteNode(TLO.Old.getNode()); - } -} - void SelectionDAGISel::ComputeLiveOutVRegInfo() { SmallPtrSet VisitedNodes; SmallVector Worklist; @@ -522,7 +448,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { } while (!Worklist.empty()); } -MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { +void SelectionDAGISel::CodeGenAndEmitDAG() { std::string GroupName; if (TimePassesIsEnabled) GroupName = "Instruction Selection and Scheduling"; @@ -531,23 +457,19 @@ MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs || ViewSUnitDAGs) BlockName = MF->getFunction()->getNameStr() + ":" + - BB->getBasicBlock()->getNameStr(); + FuncInfo->MBB->getBasicBlock()->getNameStr(); - DEBUG(dbgs() << "Initial selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Initial selection DAG:\n"; CurDAG->dump()); if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName); // Run the DAG combiner in pre-legalize mode. - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining 1", GroupName); - CurDAG->Combine(Unrestricted, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining 1", GroupName, TimePassesIsEnabled); CurDAG->Combine(Unrestricted, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized lowered selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized lowered selection DAG:\n"; CurDAG->dump()); // Second step, hack on the DAG until it only uses operations and types that // the target supports. 
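The timer hunks above and below all make the same change: the duplicated "if (TimePassesIsEnabled) { NamedRegionTimer T(...); work(); } else { work(); }" shape becomes a single scoped block that hands the enable flag straight to the timer, so the timed call is written exactly once. Below is a minimal sketch of that RAII idiom; ScopedRegionTimer and runCombineStep are illustrative stand-ins for this note only, not LLVM's Timer API.

#include <chrono>
#include <cstdio>

// Illustrative stand-in for a named region timer: it only arms itself when
// the enable flag is set, so callers no longer duplicate the timed call.
class ScopedRegionTimer {
  const char *Name;
  bool Enabled;
  std::chrono::steady_clock::time_point Start;
public:
  ScopedRegionTimer(const char *N, bool E) : Name(N), Enabled(E) {
    if (Enabled)
      Start = std::chrono::steady_clock::now();
  }
  ~ScopedRegionTimer() {
    if (!Enabled)
      return;
    long long NS = std::chrono::duration_cast<std::chrono::nanoseconds>(
                       std::chrono::steady_clock::now() - Start).count();
    std::fprintf(stderr, "%s: %lld ns\n", Name, NS);
  }
};

void runCombineStep(bool TimePassesIsEnabled) {
  {
    ScopedRegionTimer T("DAG Combining 1", TimePassesIsEnabled);
    // ... the timed work goes here, written exactly once ...
  }
}

In the patch itself the same shape appears repeatedly in CodeGenAndEmitDAG as NamedRegionTimer T("...", GroupName, TimePassesIsEnabled).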
@@ -555,44 +477,36 @@ MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { BlockName); bool Changed; - if (TimePassesIsEnabled) { - NamedRegionTimer T("Type Legalization", GroupName); - Changed = CurDAG->LegalizeTypes(); - } else { + { + NamedRegionTimer T("Type Legalization", GroupName, TimePassesIsEnabled); Changed = CurDAG->LegalizeTypes(); } - DEBUG(dbgs() << "Type-legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Type-legalized selection DAG:\n"; CurDAG->dump()); if (Changed) { if (ViewDAGCombineLT) CurDAG->viewGraph("dag-combine-lt input for " + BlockName); // Run the DAG combiner in post-type-legalize mode. - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining after legalize types", GroupName); - CurDAG->Combine(NoIllegalTypes, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining after legalize types", GroupName, + TimePassesIsEnabled); CurDAG->Combine(NoIllegalTypes, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized type-legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized type-legalized selection DAG:\n"; + CurDAG->dump()); } - if (TimePassesIsEnabled) { - NamedRegionTimer T("Vector Legalization", GroupName); - Changed = CurDAG->LegalizeVectors(); - } else { + { + NamedRegionTimer T("Vector Legalization", GroupName, TimePassesIsEnabled); Changed = CurDAG->LegalizeVectors(); } if (Changed) { - if (TimePassesIsEnabled) { - NamedRegionTimer T("Type Legalization 2", GroupName); - CurDAG->LegalizeTypes(); - } else { + { + NamedRegionTimer T("Type Legalization 2", GroupName, TimePassesIsEnabled); CurDAG->LegalizeTypes(); } @@ -600,95 +514,79 @@ MachineBasicBlock *SelectionDAGISel::CodeGenAndEmitDAG(MachineBasicBlock *BB) { CurDAG->viewGraph("dag-combine-lv input for " + BlockName); // Run the DAG combiner in post-type-legalize mode. - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining after legalize vectors", GroupName); - CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining after legalize vectors", GroupName, + TimePassesIsEnabled); CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized vector-legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized vector-legalized selection DAG:\n"; + CurDAG->dump()); } if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName); - if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Legalization", GroupName); - CurDAG->Legalize(OptLevel); - } else { + { + NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled); CurDAG->Legalize(OptLevel); } - DEBUG(dbgs() << "Legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Legalized selection DAG:\n"; CurDAG->dump()); if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName); // Run the DAG combiner in post-legalize mode. 
- if (TimePassesIsEnabled) { - NamedRegionTimer T("DAG Combining 2", GroupName); - CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); - } else { + { + NamedRegionTimer T("DAG Combining 2", GroupName, TimePassesIsEnabled); CurDAG->Combine(NoIllegalOperations, *AA, OptLevel); } - DEBUG(dbgs() << "Optimized legalized selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Optimized legalized selection DAG:\n"; CurDAG->dump()); - if (OptLevel != CodeGenOpt::None) { - ShrinkDemandedOps(); + if (OptLevel != CodeGenOpt::None) ComputeLiveOutVRegInfo(); - } if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName); // Third, instruction select all of the operations to machine code, adding the // code to the MachineBasicBlock. - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Selection", GroupName); - DoInstructionSelection(); - } else { + { + NamedRegionTimer T("Instruction Selection", GroupName, TimePassesIsEnabled); DoInstructionSelection(); } - DEBUG(dbgs() << "Selected selection DAG:\n"); - DEBUG(CurDAG->dump()); + DEBUG(dbgs() << "Selected selection DAG:\n"; CurDAG->dump()); if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName); // Schedule machine code. ScheduleDAGSDNodes *Scheduler = CreateScheduler(); - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Scheduling", GroupName); - Scheduler->Run(CurDAG, BB, BB->end()); - } else { - Scheduler->Run(CurDAG, BB, BB->end()); + { + NamedRegionTimer T("Instruction Scheduling", GroupName, + TimePassesIsEnabled); + Scheduler->Run(CurDAG, FuncInfo->MBB, FuncInfo->InsertPt); } if (ViewSUnitDAGs) Scheduler->viewGraph(); // Emit machine code to BB. This can change 'BB' to the last block being // inserted into. - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Creation", GroupName); - BB = Scheduler->EmitSchedule(); - } else { - BB = Scheduler->EmitSchedule(); + { + NamedRegionTimer T("Instruction Creation", GroupName, TimePassesIsEnabled); + + FuncInfo->MBB = Scheduler->EmitSchedule(); + FuncInfo->InsertPt = Scheduler->InsertPos; } // Free the scheduler state. - if (TimePassesIsEnabled) { - NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName); - delete Scheduler; - } else { + { + NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName, + TimePassesIsEnabled); delete Scheduler; } // Free the SelectionDAG state, now that we're finished with it. CurDAG->clear(); - - return BB; } void SelectionDAGISel::DoInstructionSelection() { @@ -750,21 +648,22 @@ void SelectionDAGISel::DoInstructionSelection() { /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and /// do other setup for EH landing-pad blocks. -void SelectionDAGISel::PrepareEHLandingPad(MachineBasicBlock *BB) { +void SelectionDAGISel::PrepareEHLandingPad() { // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. - MCSymbol *Label = MF->getMMI().addLandingPad(BB); + MCSymbol *Label = MF->getMMI().addLandingPad(FuncInfo->MBB); const TargetInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL); - BuildMI(BB, SDB->getCurDebugLoc(), II).addSym(Label); + BuildMI(*FuncInfo->MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) + .addSym(Label); // Mark exception register as live in. unsigned Reg = TLI.getExceptionAddressRegister(); - if (Reg) BB->addLiveIn(Reg); + if (Reg) FuncInfo->MBB->addLiveIn(Reg); // Mark exception selector register as live in. 
Reg = TLI.getExceptionSelectorRegister(); - if (Reg) BB->addLiveIn(Reg); + if (Reg) FuncInfo->MBB->addLiveIn(Reg); // FIXME: Hack around an exception handling flaw (PR1508): the personality // function and list of typeids logically belong to the invoke (or, if you @@ -777,7 +676,7 @@ void SelectionDAGISel::PrepareEHLandingPad(MachineBasicBlock *BB) { // in exceptions not being caught because no typeids are associated with // the invoke. This may not be the only way things can go wrong, but it // is the only way we try to work around for the moment. - const BasicBlock *LLVMBB = BB->getBasicBlock(); + const BasicBlock *LLVMBB = FuncInfo->MBB->getBasicBlock(); const BranchInst *Br = dyn_cast(LLVMBB->getTerminator()); if (Br && Br->isUnconditional()) { // Critical edge? @@ -796,83 +695,100 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Initialize the Fast-ISel state, if needed. FastISel *FastIS = 0; if (EnableFastISel) - FastIS = TLI.createFastISel(*MF, FuncInfo->ValueMap, FuncInfo->MBBMap, - FuncInfo->StaticAllocaMap, - FuncInfo->PHINodesToUpdate -#ifndef NDEBUG - , FuncInfo->CatchInfoLost -#endif - ); + FastIS = TLI.createFastISel(*FuncInfo); // Iterate over all basic blocks in the function. for (Function::const_iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) { const BasicBlock *LLVMBB = &*I; - MachineBasicBlock *BB = FuncInfo->MBBMap[LLVMBB]; + FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; + FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI(); BasicBlock::const_iterator const End = LLVMBB->end(); - BasicBlock::const_iterator BI = Begin; + BasicBlock::const_iterator BI = End; + FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI(); + + // Setup an EH landing-pad block. + if (FuncInfo->MBB->isLandingPad()) + PrepareEHLandingPad(); + // Lower any arguments needed in this block if this is the entry block. if (LLVMBB == &Fn.getEntryBlock()) LowerArguments(LLVMBB); - // Setup an EH landing-pad block. - if (BB->isLandingPad()) - PrepareEHLandingPad(BB); - // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { + FastIS->startNewBlock(); + // Emit code for any incoming arguments. This must happen before // beginning FastISel on the entry block. if (LLVMBB == &Fn.getEntryBlock()) { CurDAG->setRoot(SDB->getControlRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); + + // If we inserted any instructions at the beginning, make a note of + // where they are, so we can be sure to emit subsequent instructions + // after them. + if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) + FastIS->setLastLocalValue(llvm::prior(FuncInfo->InsertPt)); + else + FastIS->setLastLocalValue(0); } - FastIS->startNewBlock(BB); + // Do FastISel on as many instructions as possible. - for (; BI != End; ++BI) { + for (; BI != Begin; --BI) { + const Instruction *Inst = llvm::prior(BI); + + // If we no longer require this instruction, skip it. + if (!Inst->mayWriteToMemory() && + !isa(Inst) && + !isa(Inst) && + !FuncInfo->isExportedInst(Inst)) + continue; + + // Bottom-up: reset the insert pos at the top, after any local-value + // instructions. + FastIS->recomputeInsertPt(); + // Try to select the instruction with FastISel. - if (FastIS->SelectInstruction(BI)) + if (FastIS->SelectInstruction(Inst)) continue; // Then handle certain instructions as single-LLVM-Instruction blocks. 
- if (isa(BI)) { + if (isa(Inst)) { ++NumFastIselFailures; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel missed call: "; - BI->dump(); + Inst->dump(); } - if (!BI->getType()->isVoidTy() && !BI->use_empty()) { - unsigned &R = FuncInfo->ValueMap[BI]; + if (!Inst->getType()->isVoidTy() && !Inst->use_empty()) { + unsigned &R = FuncInfo->ValueMap[Inst]; if (!R) - R = FuncInfo->CreateRegForValue(BI); + R = FuncInfo->CreateRegs(Inst->getType()); } bool HadTailCall = false; - BB = SelectBasicBlock(BB, LLVMBB, BI, llvm::next(BI), HadTailCall); + SelectBasicBlock(Inst, BI, HadTailCall); // If the call was emitted as a tail call, we're done with the block. if (HadTailCall) { - BI = End; + --BI; break; } - // If the instruction was codegen'd with multiple blocks, - // inform the FastISel object where to resume inserting. - FastIS->setCurrentBlock(BB); continue; } // Otherwise, give up on FastISel for the rest of the block. // For now, be a little lenient about non-branch terminators. - if (!isa(BI) || isa(BI)) { + if (!isa(Inst) || isa(Inst)) { ++NumFastIselFailures; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel miss: "; - BI->dump(); + Inst->dump(); } if (EnableFastISelAbort) // The "fast" selector couldn't handle something and bailed. @@ -881,17 +797,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } break; } + + FastIS->recomputeInsertPt(); } // Run SelectionDAG instruction selection on the remainder of the block // not handled by FastISel. If FastISel is not run, this is the entire // block. - if (BI != End) { - bool HadTailCall; - BB = SelectBasicBlock(BB, LLVMBB, BI, End, HadTailCall); - } + bool HadTailCall; + SelectBasicBlock(Begin, BI, HadTailCall); - FinishBasicBlock(BB); + FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); } @@ -899,11 +815,11 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } void -SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { +SelectionDAGISel::FinishBasicBlock() { DEBUG(dbgs() << "Total amount of phi nodes to update: " - << FuncInfo->PHINodesToUpdate.size() << "\n"); - DEBUG(for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) + << FuncInfo->PHINodesToUpdate.size() << "\n"; + for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) dbgs() << "Node " << i << " : (" << FuncInfo->PHINodesToUpdate[i].first << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n"); @@ -917,11 +833,11 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { MachineInstr *PHI = FuncInfo->PHINodesToUpdate[i].first; assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); - if (!BB->isSuccessor(PHI->getParent())) + if (!FuncInfo->MBB->isSuccessor(PHI->getParent())) continue; PHI->addOperand( MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[i].second, false)); - PHI->addOperand(MachineOperand::CreateMBB(BB)); + PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB)); } return; } @@ -930,33 +846,35 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // Lower header first, if it wasn't already lowered if (!SDB->BitTestCases[i].Emitted) { // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->BitTestCases[i].Parent; + FuncInfo->MBB = SDB->BitTestCases[i].Parent; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code - SDB->visitBitTestHeader(SDB->BitTestCases[i], BB); + SDB->visitBitTestHeader(SDB->BitTestCases[i], FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); 
SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) { // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->BitTestCases[i].Cases[j].ThisBB; + FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code if (j+1 != ej) SDB->visitBitTestCase(SDB->BitTestCases[i].Cases[j+1].ThisBB, SDB->BitTestCases[i].Reg, SDB->BitTestCases[i].Cases[j], - BB); + FuncInfo->MBB); else SDB->visitBitTestCase(SDB->BitTestCases[i].Default, SDB->BitTestCases[i].Reg, SDB->BitTestCases[i].Cases[j], - BB); + FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } // Update PHI Nodes @@ -1001,22 +919,24 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // Lower header first, if it wasn't already lowered if (!SDB->JTCases[i].first.Emitted) { // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->JTCases[i].first.HeaderBB; + FuncInfo->MBB = SDB->JTCases[i].first.HeaderBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitJumpTableHeader(SDB->JTCases[i].second, SDB->JTCases[i].first, - BB); + FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); } // Set the current basic block to the mbb we wish to insert the code into - BB = SDB->JTCases[i].second.MBB; + FuncInfo->MBB = SDB->JTCases[i].second.MBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitJumpTable(SDB->JTCases[i].second); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - BB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); // Update PHI Nodes for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size(); @@ -1034,11 +954,11 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { (MachineOperand::CreateMBB(SDB->JTCases[i].first.HeaderBB)); } // JT BB. Just iterate over successors here - if (BB->isSuccessor(PHIBB)) { + if (FuncInfo->MBB->isSuccessor(PHIBB)) { PHI->addOperand (MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[pi].second, false)); - PHI->addOperand(MachineOperand::CreateMBB(BB)); + PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB)); } } } @@ -1050,10 +970,10 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { MachineInstr *PHI = FuncInfo->PHINodesToUpdate[i].first; assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); - if (BB->isSuccessor(PHI->getParent())) { + if (FuncInfo->MBB->isSuccessor(PHI->getParent())) { PHI->addOperand( MachineOperand::CreateReg(FuncInfo->PHINodesToUpdate[i].second, false)); - PHI->addOperand(MachineOperand::CreateMBB(BB)); + PHI->addOperand(MachineOperand::CreateMBB(FuncInfo->MBB)); } } @@ -1061,7 +981,8 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // additional DAGs necessary. for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) { // Set the current basic block to the mbb we wish to insert the code into - MachineBasicBlock *ThisBB = BB = SDB->SwitchCases[i].ThisBB; + MachineBasicBlock *ThisBB = FuncInfo->MBB = SDB->SwitchCases[i].ThisBB; + FuncInfo->InsertPt = FuncInfo->MBB->end(); // Determine the unique successors. SmallVector Succs; @@ -1071,21 +992,24 @@ SelectionDAGISel::FinishBasicBlock(MachineBasicBlock *BB) { // Emit the code. Note that this could result in ThisBB being split, so // we need to check for updates. 
- SDB->visitSwitchCase(SDB->SwitchCases[i], BB); + SDB->visitSwitchCase(SDB->SwitchCases[i], FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); - ThisBB = CodeGenAndEmitDAG(BB); + CodeGenAndEmitDAG(); + ThisBB = FuncInfo->MBB; // Handle any PHI nodes in successors of this chunk, as if we were coming // from the original BB before switch expansion. Note that PHI nodes can // occur multiple times in PHINodesToUpdate. We have to be very careful to // handle them the right number of times. for (unsigned i = 0, e = Succs.size(); i != e; ++i) { - BB = Succs[i]; - // BB may have been removed from the CFG if a branch was constant folded. - if (ThisBB->isSuccessor(BB)) { - for (MachineBasicBlock::iterator Phi = BB->begin(); - Phi != BB->end() && Phi->isPHI(); + FuncInfo->MBB = Succs[i]; + FuncInfo->InsertPt = FuncInfo->MBB->end(); + // FuncInfo->MBB may have been removed from the CFG if a branch was + // constant folded. + if (ThisBB->isSuccessor(FuncInfo->MBB)) { + for (MachineBasicBlock::iterator Phi = FuncInfo->MBB->begin(); + Phi != FuncInfo->MBB->end() && Phi->isPHI(); ++Phi) { // This value for this PHI node is recorded in PHINodesToUpdate. for (unsigned pn = 0; ; ++pn) { @@ -1205,6 +1129,7 @@ SelectInlineAsmMemoryOperands(std::vector &Ops) { Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0 Ops.push_back(InOps[InlineAsm::Op_AsmString]); // 1 Ops.push_back(InOps[InlineAsm::Op_MDNode]); // 2, !srcloc + Ops.push_back(InOps[InlineAsm::Op_IsAlignStack]); // 3 unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size(); if (InOps[e-1].getValueType() == MVT::Flag) @@ -1701,7 +1626,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, SDValue(Res, ResNumResults-1)); if ((EmitNodeInfo & OPFL_FlagOutput) != 0) - --ResNumResults; + --ResNumResults; // Move the chain reference if needed. 
if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 && diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 3786bd197b85..6cae804422ce 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -278,7 +278,7 @@ std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const { FlaggedNodes.push_back(N); while (!FlaggedNodes.empty()) { O << DOTGraphTraits - ::getSimpleNodeLabel(FlaggedNodes.back(), DAG); + ::getSimpleNodeLabel(FlaggedNodes.back(), DAG); FlaggedNodes.pop_back(); if (!FlaggedNodes.empty()) O << "\n "; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 44a80d3362d1..4f3866956cac 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -20,6 +20,7 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/GlobalVariable.h" #include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -261,6 +262,38 @@ static void InitLibcallNames(const char **Names) { Names[RTLIB::MEMMOVE] = "memmove"; Names[RTLIB::MEMSET] = "memset"; Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume"; + Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1"; + Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2"; + Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4"; + Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8"; + Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1"; + Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2"; + Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4"; + Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8"; + Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1"; + Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2"; + Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4"; + Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8"; + Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1"; + Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2"; + Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4"; + Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8"; + Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1"; + Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2"; + Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4"; + Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8"; + Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1"; + Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2"; + Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4"; + Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8"; + Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1"; + Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2"; + Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and-xor_4"; + Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8"; + Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1"; + Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2"; + Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4"; + Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8"; 
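The new RTLIB entries above register one libcall name per access width for each __sync_* operation. As a rough illustration of how a name is picked out of such a size-indexed family, here is a small self-contained helper; getSyncFetchAndAddName is a hypothetical function written for this note, not part of the LLVM API.

#include <cassert>
#include <cstdio>

// Hypothetical helper: map an atomic access width in bytes to the
// corresponding __sync_fetch_and_add_N runtime routine name.
static const char *getSyncFetchAndAddName(unsigned SizeInBytes) {
  switch (SizeInBytes) {
  case 1: return "__sync_fetch_and_add_1";
  case 2: return "__sync_fetch_and_add_2";
  case 4: return "__sync_fetch_and_add_4";
  case 8: return "__sync_fetch_and_add_8";
  default:
    assert(0 && "unsupported atomic width");
    return 0;
  }
}

int main() {
  std::printf("%s\n", getSyncFetchAndAddName(4)); // prints __sync_fetch_and_add_4
  return 0;
}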
} /// InitLibcallCallingConvs - Set default libcall CallingConvs. @@ -546,9 +579,9 @@ TargetLowering::TargetLowering(const TargetMachine &tm, SchedPreferenceInfo = Sched::Latency; JumpBufSize = 0; JumpBufAlignment = 0; - IfCvtBlockSizeLimit = 2; - IfCvtDupBlockSizeLimit = 0; PrefLoopAlignment = 0; + MinStackArgumentAlignment = 1; + ShouldFoldAtomicFences = false; InitLibcallNames(LibcallRoutineNames); InitCmpLibcallCCs(CmpLibcallCCs); @@ -578,9 +611,9 @@ bool TargetLowering::canOpTrap(unsigned Op, EVT VT) const { static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, - unsigned &NumIntermediates, - EVT &RegisterVT, - TargetLowering* TLI) { + unsigned &NumIntermediates, + EVT &RegisterVT, + TargetLowering *TLI) { // Figure out the right, legal destination reg to copy into. unsigned NumElts = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType(); @@ -610,16 +643,12 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, EVT DestVT = TLI->getRegisterType(NewVT); RegisterVT = DestVT; - if (EVT(DestVT).bitsLT(NewVT)) { - // Value is expanded, e.g. i64 -> i16. + if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits()); - } else { - // Otherwise, promotion or legal types use the same number of registers as - // the vector decimated to the appropriate level. - return NumVectorRegs; - } - return 1; + // Otherwise, promotion or legal types use the same number of registers as + // the vector decimated to the appropriate level. + return NumVectorRegs; } /// computeRegisterProperties - Once all of the register classes are added, @@ -705,39 +734,39 @@ void TargetLowering::computeRegisterProperties() { for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT VT = (MVT::SimpleValueType)i; - if (!isTypeLegal(VT)) { - MVT IntermediateVT; - EVT RegisterVT; - unsigned NumIntermediates; - NumRegistersForVT[i] = - getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, - RegisterVT, this); - RegisterTypeForVT[i] = RegisterVT; - - // Determine if there is a legal wider type. - bool IsLegalWiderType = false; - EVT EltVT = VT.getVectorElementType(); - unsigned NElts = VT.getVectorNumElements(); - for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - EVT SVT = (MVT::SimpleValueType)nVT; - if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT && - SVT.getVectorNumElements() > NElts && NElts != 1) { - TransformToType[i] = SVT; - ValueTypeActions.setTypeAction(VT, Promote); - IsLegalWiderType = true; - break; - } + if (isTypeLegal(VT)) continue; + + MVT IntermediateVT; + EVT RegisterVT; + unsigned NumIntermediates; + NumRegistersForVT[i] = + getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, + RegisterVT, this); + RegisterTypeForVT[i] = RegisterVT; + + // Determine if there is a legal wider type. + bool IsLegalWiderType = false; + EVT EltVT = VT.getVectorElementType(); + unsigned NElts = VT.getVectorNumElements(); + for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + EVT SVT = (MVT::SimpleValueType)nVT; + if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT && + SVT.getVectorNumElements() > NElts && NElts != 1) { + TransformToType[i] = SVT; + ValueTypeActions.setTypeAction(VT, Promote); + IsLegalWiderType = true; + break; } - if (!IsLegalWiderType) { - EVT NVT = VT.getPow2VectorType(); - if (NVT == VT) { - // Type is already a power of 2. 
The default action is to split. - TransformToType[i] = MVT::Other; - ValueTypeActions.setTypeAction(VT, Expand); - } else { - TransformToType[i] = NVT; - ValueTypeActions.setTypeAction(VT, Promote); - } + } + if (!IsLegalWiderType) { + EVT NVT = VT.getPow2VectorType(); + if (NVT == VT) { + // Type is already a power of 2. The default action is to split. + TransformToType[i] = MVT::Other; + ValueTypeActions.setTypeAction(VT, Expand); + } else { + TransformToType[i] = NVT; + ValueTypeActions.setTypeAction(VT, Promote); } } } @@ -811,6 +840,65 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, return 1; } +/// Get the EVTs and ArgFlags collections that represent the legalized return +/// type of the given function. This does not require a DAG or a return value, +/// and is suitable for use before any DAGs for the function are constructed. +/// TODO: Move this out of TargetLowering.cpp. +void llvm::GetReturnInfo(const Type* ReturnType, Attributes attr, + SmallVectorImpl &Outs, + const TargetLowering &TLI, + SmallVectorImpl *Offsets) { + SmallVector ValueVTs; + ComputeValueVTs(TLI, ReturnType, ValueVTs); + unsigned NumValues = ValueVTs.size(); + if (NumValues == 0) return; + unsigned Offset = 0; + + for (unsigned j = 0, f = NumValues; j != f; ++j) { + EVT VT = ValueVTs[j]; + ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + + if (attr & Attribute::SExt) + ExtendKind = ISD::SIGN_EXTEND; + else if (attr & Attribute::ZExt) + ExtendKind = ISD::ZERO_EXTEND; + + // FIXME: C calling convention requires the return type to be promoted to + // at least 32-bit. But this is not necessary for non-C calling + // conventions. The frontend should mark functions whose return values + // require promoting with signext or zeroext attributes. + if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { + EVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); + if (VT.bitsLT(MinVT)) + VT = MinVT; + } + + unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); + EVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); + unsigned PartSize = TLI.getTargetData()->getTypeAllocSize( + PartVT.getTypeForEVT(ReturnType->getContext())); + + // 'inreg' on function refers to return value + ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); + if (attr & Attribute::InReg) + Flags.setInReg(); + + // Propagate extension type if any + if (attr & Attribute::SExt) + Flags.setSExt(); + else if (attr & Attribute::ZExt) + Flags.setZExt(); + + for (unsigned i = 0; i < NumParts; ++i) { + Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true)); + if (Offsets) { + Offsets->push_back(Offset); + Offset += PartSize; + } + } + } +} + /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. This is the actual /// alignment, not its logarithm. @@ -1042,7 +1130,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask)) return true; // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; // Output known-1 bits are only known if set in both the LHS & RHS. @@ -1076,7 +1164,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (TLO.ShrinkDemandedConstant(Op, NewMask)) return true; // If the operation can be done in a smaller type, do so. 
- if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; // Output known-0 bits are only known if clear in both the LHS & RHS. @@ -1101,7 +1189,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if ((KnownZero2 & NewMask) == NewMask) return TLO.CombineTo(Op, Op.getOperand(1)); // If the operation can be done in a smaller type, do so. - if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; // If all of the unknown bits are known to be zero on one side or the other @@ -1498,13 +1586,17 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, break; } case ISD::AssertZext: { - EVT VT = cast(Op.getOperand(1))->getVT(); - APInt InMask = APInt::getLowBitsSet(BitWidth, - VT.getSizeInBits()); - if (SimplifyDemandedBits(Op.getOperand(0), InMask & NewMask, + // Demand all the bits of the input that are demanded in the output. + // The low bits are obvious; the high bits are demanded because we're + // asserting that they're zero here. + if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero, KnownOne, TLO, Depth+1)) return true; assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + + EVT VT = cast(Op.getOperand(1))->getVT(); + APInt InMask = APInt::getLowBitsSet(BitWidth, + VT.getSizeInBits()); KnownZero |= ~InMask & NewMask; break; } @@ -1544,7 +1636,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, KnownOne2, TLO, Depth+1)) return true; // See if the operation should be performed at a smaller bit width. - if (TLO.ShrinkOps && TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) + if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) return true; } // FALL THROUGH @@ -2346,7 +2438,6 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ /// vector. If it is invalid, don't add anything to Ops. void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter, - bool hasMemory, std::vector &Ops, SelectionDAG &DAG) const { switch (ConstraintLetter) { @@ -2384,7 +2475,8 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstraintLetter != 'n') { int64_t Offs = GA->getOffset(); if (C) Offs += C->getZExtValue(); - Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), + Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), + C->getDebugLoc(), Op.getValueType(), Offs)); return; } @@ -2507,18 +2599,18 @@ static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) { /// 'm' over 'r', for example. /// static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, - bool hasMemory, const TargetLowering &TLI, + const TargetLowering &TLI, SDValue Op, SelectionDAG *DAG) { assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options"); unsigned BestIdx = 0; TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown; int BestGenerality = -1; - + // Loop over the options, keeping track of the most general one. for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) { TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[i]); - + // If this is an 'other' constraint, see if the operand is valid for it. // For example, on X86 we might have an 'rI' constraint. 
If the operand // is an integer in the range [0..31] we want to use I (saving a load @@ -2527,7 +2619,7 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, assert(OpInfo.Codes[i].size() == 1 && "Unhandled multi-letter 'other' constraint"); std::vector ResultOps; - TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0], hasMemory, + TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0], ResultOps, *DAG); if (!ResultOps.empty()) { BestType = CType; @@ -2536,6 +2628,11 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, } } + // Things with matching constraints can only be registers, per gcc + // documentation. This mainly affects "g" constraints. + if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput()) + continue; + // This constraint letter is more general than the previous one, use it. int Generality = getConstraintGenerality(CType); if (Generality > BestGenerality) { @@ -2554,7 +2651,6 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo, /// OpInfo.ConstraintCode and OpInfo.ConstraintType. void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, - bool hasMemory, SelectionDAG *DAG) const { assert(!OpInfo.Codes.empty() && "Must have at least one constraint"); @@ -2563,7 +2659,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, OpInfo.ConstraintCode = OpInfo.Codes[0]; OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode); } else { - ChooseConstraint(OpInfo, hasMemory, *this, Op, DAG); + ChooseConstraint(OpInfo, *this, Op, DAG); } // 'X' matches anything. diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp index 5240bef5a5ff..6ab0cb03c065 100644 --- a/lib/CodeGen/ShadowStackGC.cpp +++ b/lib/CodeGen/ShadowStackGC.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/GCStrategy.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/IRBuilder.h" using namespace llvm; @@ -158,7 +159,8 @@ namespace { // Create a new invoke instruction. 
Args.clear(); - Args.append(CI->op_begin() + 1, CI->op_end()); + CallSite CS(CI); + Args.append(CS.arg_begin(), CS.arg_end()); InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), NewBB, CleanupBB, @@ -194,7 +196,7 @@ Constant *ShadowStackGC::GetFrameMap(Function &F) { unsigned NumMeta = 0; SmallVector Metadata; for (unsigned I = 0; I != Roots.size(); ++I) { - Constant *C = cast(Roots[I].first->getOperand(2)); + Constant *C = cast(Roots[I].first->getArgOperand(1)); if (!C->isNullValue()) NumMeta = I + 1; Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr)); @@ -322,16 +324,16 @@ void ShadowStackGC::CollectRoots(Function &F) { assert(Roots.empty() && "Not cleaned up?"); - SmallVector,16> MetaRoots; + SmallVector, 16> MetaRoots; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) if (IntrinsicInst *CI = dyn_cast(II++)) if (Function *F = CI->getCalledFunction()) if (F->getIntrinsicID() == Intrinsic::gcroot) { - std::pair Pair = std::make_pair( - CI, cast(CI->getOperand(1)->stripPointerCasts())); - if (IsNullValue(CI->getOperand(2))) + std::pair Pair = std::make_pair( + CI, cast(CI->getArgOperand(0)->stripPointerCasts())); + if (IsNullValue(CI->getArgOperand(1))) Roots.push_back(Pair); else MetaRoots.push_back(Pair); diff --git a/lib/CodeGen/SimpleHazardRecognizer.h b/lib/CodeGen/SimpleHazardRecognizer.h deleted file mode 100644 index f69feaf9e570..000000000000 --- a/lib/CodeGen/SimpleHazardRecognizer.h +++ /dev/null @@ -1,89 +0,0 @@ -//=- llvm/CodeGen/SimpleHazardRecognizer.h - Scheduling Support -*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the SimpleHazardRecognizer class, which -// implements hazard-avoidance heuristics for scheduling, based on the -// scheduling itineraries specified for the target. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_SIMPLEHAZARDRECOGNIZER_H -#define LLVM_CODEGEN_SIMPLEHAZARDRECOGNIZER_H - -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetInstrInfo.h" - -namespace llvm { - /// SimpleHazardRecognizer - A *very* simple hazard recognizer. It uses - /// a coarse classification and attempts to avoid that instructions of - /// a given class aren't grouped too densely together. - class SimpleHazardRecognizer : public ScheduleHazardRecognizer { - /// Class - A simple classification for SUnits. - enum Class { - Other, Load, Store - }; - - /// Window - The Class values of the most recently issued - /// instructions. - Class Window[8]; - - /// getClass - Classify the given SUnit. - Class getClass(const SUnit *SU) { - const MachineInstr *MI = SU->getInstr(); - const TargetInstrDesc &TID = MI->getDesc(); - if (TID.mayLoad()) - return Load; - if (TID.mayStore()) - return Store; - return Other; - } - - /// Step - Rotate the existing entries in Window and insert the - /// given class value in position as the most recent. 
- void Step(Class C) { - std::copy(Window+1, array_endof(Window), Window); - Window[array_lengthof(Window)-1] = C; - } - - public: - SimpleHazardRecognizer() : Window() { - Reset(); - } - - virtual HazardType getHazardType(SUnit *SU) { - Class C = getClass(SU); - if (C == Other) - return NoHazard; - unsigned Score = 0; - for (unsigned i = 0; i != array_lengthof(Window); ++i) - if (Window[i] == C) - Score += i + 1; - if (Score > array_lengthof(Window) * 2) - return Hazard; - return NoHazard; - } - - virtual void Reset() { - for (unsigned i = 0; i != array_lengthof(Window); ++i) - Window[i] = Other; - } - - virtual void EmitInstruction(SUnit *SU) { - Step(getClass(SU)); - } - - virtual void AdvanceCycle() { - Step(Other); - } - }; -} - -#endif diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp index ed3c243ff3e4..e69d3e4fa78a 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.cpp +++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp @@ -99,15 +99,23 @@ void SimpleRegisterCoalescing::getAnalysisUsage(AnalysisUsage &AU) const { /// /// This returns true if an interval was modified. /// -bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, - LiveInterval &IntB, +bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI) { + // Bail if there is no dst interval - can happen when merging physical subreg + // operations. + if (!li_->hasInterval(CP.getDstReg())) + return false; + + LiveInterval &IntA = + li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getDefIndex(); // BValNo is a value number in B that is defined by a copy from A. 'B3' in // the example above. LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx); - assert(BLR != IntB.end() && "Live range not found!"); + if (BLR == IntB.end()) return false; VNInfo *BValNo = BLR->valno; // Get the location that B is defined at. Two options: either this value has @@ -119,7 +127,8 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // AValNo is the value number in A that defines the copy, A3 in the example. SlotIndex CopyUseIdx = CopyIdx.getUseIndex(); LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx); - assert(ALR != IntA.end() && "Live range not found!"); + // The live range might not exist after fun with physreg coalescing. + if (ALR == IntA.end()) return false; VNInfo *AValNo = ALR->valno; // If it's re-defined by an early clobber somewhere in the live range, then // it's not safe to eliminate the copy. FIXME: This is a temporary workaround. @@ -145,26 +154,21 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // If AValNo is defined as a copy from IntB, we can potentially process this. // Get the instruction that defines this value number. - unsigned SrcReg = li_->getVNInfoSourceReg(AValNo); - if (!SrcReg) return false; // Not defined by a copy. - - // If the value number is not defined by a copy instruction, ignore it. - - // If the source register comes from an interval other than IntB, we can't - // handle this. - if (SrcReg != IntB.reg) return false; + if (!CP.isCoalescable(AValNo->getCopy())) + return false; // Get the LiveRange in IntB that this value number starts with. 
LiveInterval::iterator ValLR = IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot()); - assert(ValLR != IntB.end() && "Live range not found!"); + if (ValLR == IntB.end()) + return false; // Make sure that the end of the live range is inside the same block as // CopyMI. MachineInstr *ValLREndInst = li_->getInstructionFromIndex(ValLR->end.getPrevSlot()); - if (!ValLREndInst || - ValLREndInst->getParent() != CopyMI->getParent()) return false; + if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent()) + return false; // Okay, we now know that ValLR ends in the same block that the CopyMI // live-range starts. If there are no intervening live ranges between them in @@ -207,6 +211,8 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // physreg has sub-registers, update their live intervals as well. if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) { for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { + if (!li_->hasInterval(*SR)) + continue; LiveInterval &SRLI = li_->getInterval(*SR); SRLI.addRange(LiveRange(FillerStart, FillerEnd, SRLI.getNextValue(FillerStart, 0, true, @@ -216,7 +222,6 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, // Okay, merge "B1" into the same value number as "B0". if (BValNo != ValLR->valno) { - IntB.addKills(ValLR->valno, BValNo->kills); IntB.MergeValueNumberInto(BValNo, ValLR->valno); } DEBUG({ @@ -230,13 +235,12 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA, int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true); if (UIdx != -1) { ValLREndInst->getOperand(UIdx).setIsKill(false); - ValLR->valno->removeKill(FillerStart); } // If the copy instruction was killing the destination register before the // merge, find the last use and trim the live range. That will also add the // isKill marker. - if (ALR->valno->isKill(CopyIdx)) + if (ALR->end == CopyIdx) TrimLiveIntervalToLastUse(CopyUseIdx, CopyMI->getParent(), IntA, ALR); ++numExtends; @@ -304,23 +308,31 @@ TransferImplicitOps(MachineInstr *MI, MachineInstr *NewMI) { /// /// This returns true if an interval was modified. /// -bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, - LiveInterval &IntB, +bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP, MachineInstr *CopyMI) { - SlotIndex CopyIdx = - li_->getInstructionIndex(CopyMI).getDefIndex(); - // FIXME: For now, only eliminate the copy by commuting its def when the // source register is a virtual register. We want to guard against cases // where the copy is a back edge copy and commuting the def lengthen the // live interval of the source register to the entire loop. - if (TargetRegisterInfo::isPhysicalRegister(IntA.reg)) + if (CP.isPhys() && CP.isFlipped()) + return false; + + // Bail if there is no dst interval. + if (!li_->hasInterval(CP.getDstReg())) return false; + SlotIndex CopyIdx = + li_->getInstructionIndex(CopyMI).getDefIndex(); + + LiveInterval &IntA = + li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg()); + LiveInterval &IntB = + li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg()); + // BValNo is a value number in B that is defined by a copy from A. 'B3' in // the example above. LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx); - assert(BLR != IntB.end() && "Live range not found!"); + if (BLR == IntB.end()) return false; VNInfo *BValNo = BLR->valno; // Get the location that B is defined at. 
Two options: either this value has @@ -342,6 +354,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, AValNo->isUnused() || AValNo->hasPHIKill()) return false; MachineInstr *DefMI = li_->getInstructionFromIndex(AValNo->def); + if (!DefMI) + return false; const TargetInstrDesc &TID = DefMI->getDesc(); if (!TID.isCommutable()) return false; @@ -380,7 +394,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, // clobbers from the superreg. if (BHasSubRegs) for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) - if (HasOtherReachingDefs(IntA, li_->getInterval(*SR), AValNo, 0)) + if (li_->hasInterval(*SR) && + HasOtherReachingDefs(IntA, li_->getInterval(*SR), AValNo, 0)) return false; // If some of the uses of IntA.reg is already coalesced away, return false. @@ -413,7 +428,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, bool BHasPHIKill = BValNo->hasPHIKill(); SmallVector BDeadValNos; - VNInfo::KillSet BKills; std::map BExtend; // If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g. @@ -424,8 +438,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, // C = A // ... // = B - // - // then do not add kills of A to the newly created B interval. bool Extended = BLR->end > ALR->end && ALR->end != ALR->start; if (Extended) BExtend[ALR->end] = BLR->end; @@ -448,34 +460,38 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx); if (ULR == IntA.end() || ULR->valno != AValNo) continue; - UseMO.setReg(NewReg); + if (TargetRegisterInfo::isPhysicalRegister(NewReg)) + UseMO.substPhysReg(NewReg, *tri_); + else + UseMO.setReg(NewReg); if (UseMI == CopyMI) continue; if (UseMO.isKill()) { if (Extended) UseMO.setIsKill(false); - else - BKills.push_back(UseIdx.getDefIndex()); } unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (!tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) + if (UseMI->isCopy()) { + if (UseMI->getOperand(0).getReg() != IntB.reg || + UseMI->getOperand(0).getSubReg()) + continue; + } else if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)){ + if (DstReg != IntB.reg || DstSubIdx) + continue; + } else continue; - if (DstReg == IntB.reg && DstSubIdx == 0) { - // This copy will become a noop. If it's defining a new val#, - // remove that val# as well. However this live range is being - // extended to the end of the existing live range defined by the copy. - SlotIndex DefIdx = UseIdx.getDefIndex(); - const LiveRange *DLR = IntB.getLiveRangeContaining(DefIdx); - BHasPHIKill |= DLR->valno->hasPHIKill(); - assert(DLR->valno->def == DefIdx); - BDeadValNos.push_back(DLR->valno); - BExtend[DLR->start] = DLR->end; - JoinedCopies.insert(UseMI); - // If this is a kill but it's going to be removed, the last use - // of the same val# is the new kill. - if (UseMO.isKill()) - BKills.pop_back(); - } + // This copy will become a noop. If it's defining a new val#, + // remove that val# as well. However this live range is being + // extended to the end of the existing live range defined by the copy. 
+ SlotIndex DefIdx = UseIdx.getDefIndex(); + const LiveRange *DLR = IntB.getLiveRangeContaining(DefIdx); + if (!DLR) + continue; + BHasPHIKill |= DLR->valno->hasPHIKill(); + assert(DLR->valno->def == DefIdx); + BDeadValNos.push_back(DLR->valno); + BExtend[DLR->start] = DLR->end; + JoinedCopies.insert(UseMI); } // We need to insert a new liverange: [ALR.start, LastUse). It may be we can @@ -490,24 +506,21 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, VNInfo *DeadVNI = BDeadValNos[i]; if (BHasSubRegs) { for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { + if (!li_->hasInterval(*SR)) + continue; LiveInterval &SRLI = li_->getInterval(*SR); - const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def); - SRLI.removeValNo(SRLR->valno); + if (const LiveRange *SRLR = SRLI.getLiveRangeContaining(DeadVNI->def)) + SRLI.removeValNo(SRLR->valno); } } IntB.removeValNo(BDeadValNos[i]); } // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition - // is updated. Kills are also updated. + // is updated. VNInfo *ValNo = BValNo; ValNo->def = AValNo->def; ValNo->setCopy(0); - for (unsigned j = 0, ee = ValNo->kills.size(); j != ee; ++j) { - if (ValNo->kills[j] != BLR->end) - BKills.push_back(ValNo->kills[j]); - } - ValNo->kills.clear(); for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end(); AI != AE; ++AI) { if (AI->valno != AValNo) continue; @@ -517,18 +530,7 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA, if (EI != BExtend.end()) End = EI->second; IntB.addRange(LiveRange(AI->start, End, ValNo)); - - // If the IntB live range is assigned to a physical register, and if that - // physreg has sub-registers, update their live intervals as well. - if (BHasSubRegs) { - for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) { - LiveInterval &SRLI = li_->getInterval(*SR); - SRLI.MergeInClobberRange(*li_, AI->start, End, - li_->getVNInfoAllocator()); - } - } } - IntB.addKills(ValNo, BKills); ValNo->setHasPHIKill(BHasPHIKill); DEBUG({ @@ -621,7 +623,11 @@ SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx, // of last use. LastUse->setIsKill(); removeRange(li, LastUseIdx.getDefIndex(), LR->end, li_, tri_); - LR->valno->addKill(LastUseIdx.getDefIndex()); + if (LastUseMI->isCopy()) { + MachineOperand &DefMO = LastUseMI->getOperand(0); + if (DefMO.getReg() == li.reg && !DefMO.getSubReg()) + DefMO.setIsDead(); + } unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; if (tii_->isMoveInstr(*LastUseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && DstReg == li.reg && DstSubIdx == 0) { @@ -663,6 +669,7 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, ValNo->isUnused() || ValNo->hasPHIKill()) return false; MachineInstr *DefMI = li_->getInstructionFromIndex(ValNo->def); + assert(DefMI && "Defining instruction disappeared"); const TargetInstrDesc &TID = DefMI->getDesc(); if (!TID.isAsCheapAsAMove()) return false; @@ -701,33 +708,20 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, return false; } - SlotIndex DefIdx = CopyIdx.getDefIndex(); - const LiveRange *DLR= li_->getInterval(DstReg).getLiveRangeContaining(DefIdx); - DLR->valno->setCopy(0); - // Don't forget to update sub-register intervals. 
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - for (const unsigned* SR = tri_->getSubRegisters(DstReg); *SR; ++SR) { - if (!li_->hasInterval(*SR)) - continue; - const LiveRange *DLR = - li_->getInterval(*SR).getLiveRangeContaining(DefIdx); - if (DLR && DLR->valno->getCopy() == CopyMI) - DLR->valno->setCopy(0); - } - } + RemoveCopyFlag(DstReg, CopyMI); // If copy kills the source register, find the last use and propagate // kill. bool checkForDeadDef = false; MachineBasicBlock *MBB = CopyMI->getParent(); - if (SrcLR->valno->isKill(DefIdx)) + if (SrcLR->end == CopyIdx.getDefIndex()) if (!TrimLiveIntervalToLastUse(CopyIdx, MBB, SrcInt, SrcLR)) { checkForDeadDef = true; } MachineBasicBlock::iterator MII = llvm::next(MachineBasicBlock::iterator(CopyMI)); - tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, tri_); + tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, *tri_); MachineInstr *NewMI = prior(MII); if (checkForDeadDef) { @@ -747,24 +741,8 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, MachineOperand &MO = CopyMI->getOperand(i); if (MO.isReg() && MO.isImplicit()) NewMI->addOperand(MO); - if (MO.isDef() && li_->hasInterval(MO.getReg())) { - unsigned Reg = MO.getReg(); - const LiveRange *DLR = - li_->getInterval(Reg).getLiveRangeContaining(DefIdx); - if (DLR && DLR->valno->getCopy() == CopyMI) - DLR->valno->setCopy(0); - // Handle subregs as well - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - for (const unsigned* SR = tri_->getSubRegisters(Reg); *SR; ++SR) { - if (!li_->hasInterval(*SR)) - continue; - const LiveRange *DLR = - li_->getInterval(*SR).getLiveRangeContaining(DefIdx); - if (DLR && DLR->valno->getCopy() == CopyMI) - DLR->valno->setCopy(0); - } - } - } + if (MO.isDef()) + RemoveCopyFlag(MO.getReg(), CopyMI); } TransferImplicitOps(CopyMI, NewMI); @@ -783,84 +761,72 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt, /// being updated is not zero, make sure to set it to the correct physical /// subregister. void -SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, - unsigned SubIdx) { - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - if (DstIsPhys && SubIdx) { - // Figure out the real physical register we are updating with. - DstReg = tri_->getSubReg(DstReg, SubIdx); - SubIdx = 0; - } - - // Copy the register use-list before traversing it. We may be adding operands - // and invalidating pointers. - SmallVector, 32> reglist; - for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg), - E = mri_->reg_end(); I != E; ++I) - reglist.push_back(std::make_pair(&*I, I.getOperandNo())); - - for (unsigned N=0; N != reglist.size(); ++N) { - MachineInstr *UseMI = reglist[N].first; - MachineOperand &O = UseMI->getOperand(reglist[N].second); - unsigned OldSubIdx = O.getSubReg(); +SimpleRegisterCoalescing::UpdateRegDefsUses(const CoalescerPair &CP) { + bool DstIsPhys = CP.isPhys(); + unsigned SrcReg = CP.getSrcReg(); + unsigned DstReg = CP.getDstReg(); + unsigned SubIdx = CP.getSubIdx(); + + for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg); + MachineInstr *UseMI = I.skipInstruction();) { + // A PhysReg copy that won't be coalesced can perhaps be rematerialized + // instead. 
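The rewritten UpdateRegDefsUses that begins above walks the register's use list with skipInstruction(), advancing past the current instruction before any of its operands are rewritten, instead of snapshotting the whole list first as the old code did. A small sketch of the same advance-before-mutate pattern on a toy use list (the types and the identity-copy cleanup are invented for illustration):

    #include <cstdio>
    #include <list>

    // A toy "copy" instruction: DefReg = UseReg.
    struct ToyCopy { int DefReg, UseReg; };

    // Rewrite uses of OldReg to NewReg.  The iterator is advanced before the
    // current element is touched, so erasing that element (when the copy turns
    // into an identity copy) cannot break the traversal.
    static void rewriteUses(std::list<ToyCopy> &Uses, int OldReg, int NewReg) {
      for (std::list<ToyCopy>::iterator I = Uses.begin(); I != Uses.end();) {
        std::list<ToyCopy>::iterator Cur = I++;      // advance first
        if (Cur->UseReg == OldReg)
          Cur->UseReg = NewReg;
        if (Cur->UseReg == Cur->DefReg)
          Uses.erase(Cur);                           // now an identity copy, drop it
      }
    }

    int main() {
      std::list<ToyCopy> Uses;
      ToyCopy A = { 42, 1024 }, B = { 7, 3 };
      Uses.push_back(A);
      Uses.push_back(B);
      rewriteUses(Uses, 1024, 42);
      std::printf("remaining uses: %u\n", (unsigned)Uses.size());
      return 0;
    }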
if (DstIsPhys) { - unsigned UseDstReg = DstReg; - if (OldSubIdx) - UseDstReg = tri_->getSubReg(DstReg, OldSubIdx); - unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx; if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx) && - CopySrcSubIdx == 0 && - CopyDstSubIdx == 0 && - CopySrcReg != CopyDstReg && - CopySrcReg == SrcReg && CopyDstReg != UseDstReg) { - // If the use is a copy and it won't be coalesced away, and its source - // is defined by a trivial computation, try to rematerialize it instead. - if (!JoinedCopies.count(UseMI) && - ReMaterializeTrivialDef(li_->getInterval(SrcReg), CopyDstReg, - CopyDstSubIdx, UseMI)) - continue; - } + CopySrcSubIdx == 0 && CopyDstSubIdx == 0 && + CopySrcReg != CopyDstReg && CopySrcReg == SrcReg && + CopyDstReg != DstReg && !JoinedCopies.count(UseMI) && + ReMaterializeTrivialDef(li_->getInterval(SrcReg), CopyDstReg, 0, + UseMI)) + continue; - O.setReg(UseDstReg); - O.setSubReg(0); - if (OldSubIdx) { - // Def and kill of subregister of a virtual register actually defs and - // kills the whole register. Add imp-defs and imp-kills as needed. - if (O.isDef()) { - if(O.isDead()) - UseMI->addRegisterDead(DstReg, tri_, true); - else - UseMI->addRegisterDefined(DstReg, tri_); - } else if (!O.isUndef() && - (O.isKill() || - UseMI->isRegTiedToDefOperand(&O-&UseMI->getOperand(0)))) - UseMI->addRegisterKilled(DstReg, tri_, true); - } + if (UseMI->isCopy() && + !UseMI->getOperand(1).getSubReg() && + !UseMI->getOperand(0).getSubReg() && + UseMI->getOperand(1).getReg() == SrcReg && + UseMI->getOperand(0).getReg() != SrcReg && + UseMI->getOperand(0).getReg() != DstReg && + !JoinedCopies.count(UseMI) && + ReMaterializeTrivialDef(li_->getInterval(SrcReg), + UseMI->getOperand(0).getReg(), 0, UseMI)) + continue; + } - DEBUG({ - dbgs() << "\t\tupdated: "; - if (!UseMI->isDebugValue()) - dbgs() << li_->getInstructionIndex(UseMI) << "\t"; - dbgs() << *UseMI; - }); - continue; + SmallVector Ops; + bool Reads, Writes; + tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops); + bool Kills = false, Deads = false; + + // Replace SrcReg with DstReg in all UseMI operands. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + MachineOperand &MO = UseMI->getOperand(Ops[i]); + Kills |= MO.isKill(); + Deads |= MO.isDead(); + + if (DstIsPhys) + MO.substPhysReg(DstReg, *tri_); + else + MO.substVirtReg(DstReg, SubIdx, *tri_); } - // Sub-register indexes goes from small to large. e.g. - // RAX: 1 -> AL, 2 -> AX, 3 -> EAX - // EAX: 1 -> AL, 2 -> AX - // So RAX's sub-register 2 is AX, RAX's sub-regsiter 3 is EAX, whose - // sub-register 2 is also AX. - // - // FIXME: Properly compose subreg indices for all targets. - // - if (SubIdx && OldSubIdx && SubIdx != OldSubIdx) - ; - else if (SubIdx) - O.setSubReg(SubIdx); - O.setReg(DstReg); + // This instruction is a copy that will be removed. + if (JoinedCopies.count(UseMI)) + continue; + + if (SubIdx) { + // If UseMI was a simple SrcReg def, make sure we didn't turn it into a + // read-modify-write of DstReg. + if (Deads) + UseMI->addRegisterDead(DstReg, tri_); + else if (!Reads && Writes) + UseMI->addRegisterDefined(DstReg, tri_); + + // Kill flags apply to the whole physical register. 
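The loop above first collects every operand of UseMI that touches SrcReg, rewrites each one to DstReg via substPhysReg or substVirtReg, and remembers whether any operand carried kill or dead flags so they can be reapplied at full-register width afterwards. A self-contained sketch of that collect-then-substitute pass over toy operands (only the control flow mirrors the code; the types are invented):

    #include <cstdio>
    #include <vector>

    struct ToyOperand {
      int Reg;
      bool IsDef, IsKill, IsDead;
    };

    // Rewrite every operand that touches SrcReg so it refers to DstReg, and
    // report whether any rewritten operand carried a kill or dead flag.  The
    // real code then reapplies those flags to the whole register
    // (addRegisterKilled / addRegisterDead) when a sub-register index is in play.
    static void substituteReg(std::vector<ToyOperand> &Ops, int SrcReg,
                              int DstReg, bool &Kills, bool &Deads) {
      Kills = Deads = false;
      for (size_t i = 0, e = Ops.size(); i != e; ++i) {
        ToyOperand &MO = Ops[i];
        if (MO.Reg != SrcReg)
          continue;
        Kills |= MO.IsKill;
        Deads |= MO.IsDead;
        MO.Reg = DstReg;          // stands in for substPhysReg / substVirtReg
      }
    }

    int main() {
      std::vector<ToyOperand> Ops;
      ToyOperand Use = { 1024, false, true, false };   // reads reg 1024, kill
      ToyOperand Def = { 7, true, false, false };      // unrelated def
      Ops.push_back(Use);
      Ops.push_back(Def);
      bool Kills, Deads;
      substituteReg(Ops, 1024, 42, Kills, Deads);
      std::printf("reg=%d sawKill=%d sawDead=%d\n",
                  Ops[0].Reg, (int)Kills, (int)Deads);
      return 0;
    }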
+ if (DstIsPhys && Kills) + UseMI->addRegisterKilled(DstReg, tri_); + } DEBUG({ dbgs() << "\t\tupdated: "; @@ -869,15 +835,15 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, dbgs() << *UseMI; }); + // After updating the operand, check if the machine instruction has // become a copy. If so, update its val# information. - if (JoinedCopies.count(UseMI)) + const TargetInstrDesc &TID = UseMI->getDesc(); + if (DstIsPhys || TID.getNumDefs() != 1 || TID.getNumOperands() <= 2) continue; - const TargetInstrDesc &TID = UseMI->getDesc(); unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx; - if (TID.getNumDefs() == 1 && TID.getNumOperands() > 2 && - tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, + if (tii_->isMoveInstr(*UseMI, CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx) && CopySrcReg != CopyDstReg && (TargetRegisterInfo::isVirtualRegister(CopyDstReg) || @@ -945,6 +911,27 @@ bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li, return removeIntervalIfEmpty(li, li_, tri_); } +void SimpleRegisterCoalescing::RemoveCopyFlag(unsigned DstReg, + const MachineInstr *CopyMI) { + SlotIndex DefIdx = li_->getInstructionIndex(CopyMI).getDefIndex(); + if (li_->hasInterval(DstReg)) { + LiveInterval &LI = li_->getInterval(DstReg); + if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx)) + if (LR->valno->getCopy() == CopyMI) + LR->valno->setCopy(0); + } + if (!TargetRegisterInfo::isPhysicalRegister(DstReg)) + return; + for (const unsigned* AS = tri_->getAliasSet(DstReg); *AS; ++AS) { + if (!li_->hasInterval(*AS)) + continue; + LiveInterval &LI = li_->getInterval(*AS); + if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx)) + if (LR->valno->getCopy() == CopyMI) + LR->valno->setCopy(0); + } +} + /// PropagateDeadness - Propagate the dead marker to the instruction which /// defines the val#. static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI, @@ -978,8 +965,8 @@ SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li, // Live-in to the function but dead. Remove it from entry live-in set. if (mf_->begin()->isLiveIn(li.reg)) mf_->begin()->removeLiveIn(li.reg); - const LiveRange *LR = li.getLiveRangeContaining(CopyIdx); - removeRange(li, LR->start, LR->end, li_, tri_); + if (const LiveRange *LR = li.getLiveRangeContaining(CopyIdx)) + removeRange(li, LR->start, LR->end, li_, tri_); return removeIntervalIfEmpty(li, li_, tri_); } @@ -1017,147 +1004,12 @@ SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li, // val#, then propagate the dead marker. PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_); ++numDeadValNo; - - if (LR->valno->isKill(RemoveEnd)) - LR->valno->removeKill(RemoveEnd); } removeRange(li, RemoveStart, RemoveEnd, li_, tri_); return removeIntervalIfEmpty(li, li_, tri_); } -/// CanCoalesceWithImpDef - Returns true if the specified copy instruction -/// from an implicit def to another register can be coalesced away. -bool SimpleRegisterCoalescing::CanCoalesceWithImpDef(MachineInstr *CopyMI, - LiveInterval &li, - LiveInterval &ImpLi) const{ - if (!CopyMI->killsRegister(ImpLi.reg)) - return false; - // Make sure this is the only use. 
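The new RemoveCopyFlag above looks up whichever value of DstReg is live at the copy's def slot and, if that value was marked as defined by CopyMI, clears the marker; for a physical register the same is repeated for every alias. A standalone model of that lookup-and-clear with invented containers in place of LiveIntervals:

    #include <cstdio>
    #include <map>
    #include <vector>

    struct ToyValue { const void *DefiningCopy; };
    struct ToyRange { int Start, End; ToyValue *ValNo; };
    typedef std::map<int, std::vector<ToyRange> > IntervalMap;  // reg -> ranges

    // Find the range of Reg that contains Idx; if its value is marked as being
    // defined by CopyMI, clear that marker.  For a physical register the
    // caller would repeat this for every alias, just like RemoveCopyFlag.
    static void removeCopyFlag(IntervalMap &LIs, int Reg, int Idx,
                               const void *CopyMI) {
      IntervalMap::iterator It = LIs.find(Reg);
      if (It == LIs.end())
        return;                               // no interval for this register
      std::vector<ToyRange> &Ranges = It->second;
      for (size_t i = 0, e = Ranges.size(); i != e; ++i)
        if (Ranges[i].Start <= Idx && Idx < Ranges[i].End &&
            Ranges[i].ValNo->DefiningCopy == CopyMI)
          Ranges[i].ValNo->DefiningCopy = 0;
    }

    int main() {
      int CopyMI = 0;                  // stands in for the copy instruction
      ToyValue V;
      V.DefiningCopy = &CopyMI;
      IntervalMap LIs;
      ToyRange R = { 0, 10, &V };
      LIs[42].push_back(R);
      removeCopyFlag(LIs, 42, 4, &CopyMI);
      std::printf("cleared=%d\n", (int)(V.DefiningCopy == 0));
      return 0;
    }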
- for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(ImpLi.reg), - UE = mri_->use_end(); UI != UE;) { - MachineInstr *UseMI = &*UI; - ++UI; - if (CopyMI == UseMI || JoinedCopies.count(UseMI)) - continue; - return false; - } - return true; -} - - -/// isWinToJoinVRWithSrcPhysReg - Return true if it's worth while to join a -/// a virtual destination register with physical source register. -bool -SimpleRegisterCoalescing::isWinToJoinVRWithSrcPhysReg(MachineInstr *CopyMI, - MachineBasicBlock *CopyMBB, - LiveInterval &DstInt, - LiveInterval &SrcInt) { - // If the virtual register live interval is long but it has low use desity, - // do not join them, instead mark the physical register as its allocation - // preference. - const TargetRegisterClass *RC = mri_->getRegClass(DstInt.reg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(DstInt); - if (Length > Threshold && - std::distance(mri_->use_nodbg_begin(DstInt.reg), - mri_->use_nodbg_end()) * Threshold < Length) - return false; - - // If the virtual register live interval extends into a loop, turn down - // aggressiveness. - SlotIndex CopyIdx = - li_->getInstructionIndex(CopyMI).getDefIndex(); - const MachineLoop *L = loopInfo->getLoopFor(CopyMBB); - if (!L) { - // Let's see if the virtual register live interval extends into the loop. - LiveInterval::iterator DLR = DstInt.FindLiveRangeContaining(CopyIdx); - assert(DLR != DstInt.end() && "Live range not found!"); - DLR = DstInt.FindLiveRangeContaining(DLR->end.getNextSlot()); - if (DLR != DstInt.end()) { - CopyMBB = li_->getMBBFromIndex(DLR->start); - L = loopInfo->getLoopFor(CopyMBB); - } - } - - if (!L || Length <= Threshold) - return true; - - SlotIndex UseIdx = CopyIdx.getUseIndex(); - LiveInterval::iterator SLR = SrcInt.FindLiveRangeContaining(UseIdx); - MachineBasicBlock *SMBB = li_->getMBBFromIndex(SLR->start); - if (loopInfo->getLoopFor(SMBB) != L) { - if (!loopInfo->isLoopHeader(CopyMBB)) - return false; - // If vr's live interval extends pass the loop header, do not join. - for (MachineBasicBlock::succ_iterator SI = CopyMBB->succ_begin(), - SE = CopyMBB->succ_end(); SI != SE; ++SI) { - MachineBasicBlock *SuccMBB = *SI; - if (SuccMBB == CopyMBB) - continue; - if (DstInt.overlaps(li_->getMBBStartIdx(SuccMBB), - li_->getMBBEndIdx(SuccMBB))) - return false; - } - } - return true; -} - -/// isWinToJoinVRWithDstPhysReg - Return true if it's worth while to join a -/// copy from a virtual source register to a physical destination register. -bool -SimpleRegisterCoalescing::isWinToJoinVRWithDstPhysReg(MachineInstr *CopyMI, - MachineBasicBlock *CopyMBB, - LiveInterval &DstInt, - LiveInterval &SrcInt) { - // If the virtual register live interval is long but it has low use density, - // do not join them, instead mark the physical register as its allocation - // preference. - const TargetRegisterClass *RC = mri_->getRegClass(SrcInt.reg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(SrcInt); - if (Length > Threshold && - std::distance(mri_->use_nodbg_begin(SrcInt.reg), - mri_->use_nodbg_end()) * Threshold < Length) - return false; - - if (SrcInt.empty()) - // Must be implicit_def. - return false; - - // If the virtual register live interval is defined or cross a loop, turn - // down aggressiveness. 
- SlotIndex CopyIdx = - li_->getInstructionIndex(CopyMI).getDefIndex(); - SlotIndex UseIdx = CopyIdx.getUseIndex(); - LiveInterval::iterator SLR = SrcInt.FindLiveRangeContaining(UseIdx); - assert(SLR != SrcInt.end() && "Live range not found!"); - SLR = SrcInt.FindLiveRangeContaining(SLR->start.getPrevSlot()); - if (SLR == SrcInt.end()) - return true; - MachineBasicBlock *SMBB = li_->getMBBFromIndex(SLR->start); - const MachineLoop *L = loopInfo->getLoopFor(SMBB); - - if (!L || Length <= Threshold) - return true; - - if (loopInfo->getLoopFor(CopyMBB) != L) { - if (SMBB != L->getLoopLatch()) - return false; - // If vr's live interval is extended from before the loop latch, do not - // join. - for (MachineBasicBlock::pred_iterator PI = SMBB->pred_begin(), - PE = SMBB->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *PredMBB = *PI; - if (PredMBB == SMBB) - continue; - if (SrcInt.overlaps(li_->getMBBStartIdx(PredMBB), - li_->getMBBEndIdx(PredMBB))) - return false; - } - } - return true; -} /// isWinToJoinCrossClass - Return true if it's profitable to coalesce /// two virtual registers from different register classes. @@ -1203,157 +1055,6 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg, return true; } -/// HasIncompatibleSubRegDefUse - If we are trying to coalesce a virtual -/// register with a physical register, check if any of the virtual register -/// operand is a sub-register use or def. If so, make sure it won't result -/// in an illegal extract_subreg or insert_subreg instruction. e.g. -/// vr1024 = extract_subreg vr1025, 1 -/// ... -/// vr1024 = mov8rr AH -/// If vr1024 is coalesced with AH, the extract_subreg is now illegal since -/// AH does not have a super-reg whose sub-register 1 is AH. -bool -SimpleRegisterCoalescing::HasIncompatibleSubRegDefUse(MachineInstr *CopyMI, - unsigned VirtReg, - unsigned PhysReg) { - for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(VirtReg), - E = mri_->reg_end(); I != E; ++I) { - MachineOperand &O = I.getOperand(); - if (O.isDebug()) - continue; - MachineInstr *MI = &*I; - if (MI == CopyMI || JoinedCopies.count(MI)) - continue; - unsigned SubIdx = O.getSubReg(); - if (SubIdx && !tri_->getSubReg(PhysReg, SubIdx)) - return true; - if (MI->isExtractSubreg()) { - SubIdx = MI->getOperand(2).getImm(); - if (O.isUse() && !tri_->getSubReg(PhysReg, SubIdx)) - return true; - if (O.isDef()) { - unsigned SrcReg = MI->getOperand(1).getReg(); - const TargetRegisterClass *RC = - TargetRegisterInfo::isPhysicalRegister(SrcReg) - ? tri_->getPhysicalRegisterRegClass(SrcReg) - : mri_->getRegClass(SrcReg); - if (!tri_->getMatchingSuperReg(PhysReg, SubIdx, RC)) - return true; - } - } - if (MI->isInsertSubreg() || MI->isSubregToReg()) { - SubIdx = MI->getOperand(3).getImm(); - if (VirtReg == MI->getOperand(0).getReg()) { - if (!tri_->getSubReg(PhysReg, SubIdx)) - return true; - } else { - unsigned DstReg = MI->getOperand(0).getReg(); - const TargetRegisterClass *RC = - TargetRegisterInfo::isPhysicalRegister(DstReg) - ? tri_->getPhysicalRegisterRegClass(DstReg) - : mri_->getRegClass(DstReg); - if (!tri_->getMatchingSuperReg(PhysReg, SubIdx, RC)) - return true; - } - } - } - return false; -} - - -/// CanJoinExtractSubRegToPhysReg - Return true if it's possible to coalesce -/// an extract_subreg where dst is a physical register, e.g. 
-/// cl = EXTRACT_SUBREG reg1024, 1 -bool -SimpleRegisterCoalescing::CanJoinExtractSubRegToPhysReg(unsigned DstReg, - unsigned SrcReg, unsigned SubIdx, - unsigned &RealDstReg) { - const TargetRegisterClass *RC = mri_->getRegClass(SrcReg); - RealDstReg = tri_->getMatchingSuperReg(DstReg, SubIdx, RC); - if (!RealDstReg) { - DEBUG(dbgs() << "\tIncompatible source regclass: " - << "none of the super-registers of " << tri_->getName(DstReg) - << " are in " << RC->getName() << ".\n"); - return false; - } - - LiveInterval &RHS = li_->getInterval(SrcReg); - // For this type of EXTRACT_SUBREG, conservatively - // check if the live interval of the source register interfere with the - // actual super physical register we are trying to coalesce with. - if (li_->hasInterval(RealDstReg) && - RHS.overlaps(li_->getInterval(RealDstReg))) { - DEBUG({ - dbgs() << "\t\tInterfere with register "; - li_->getInterval(RealDstReg).print(dbgs(), tri_); - }); - return false; // Not coalescable - } - for (const unsigned* SR = tri_->getSubRegisters(RealDstReg); *SR; ++SR) - // Do not check DstReg or its sub-register. JoinIntervals() will take care - // of that. - if (*SR != DstReg && - !tri_->isSubRegister(DstReg, *SR) && - li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) { - DEBUG({ - dbgs() << "\t\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); - }); - return false; // Not coalescable - } - return true; -} - -/// CanJoinInsertSubRegToPhysReg - Return true if it's possible to coalesce -/// an insert_subreg where src is a physical register, e.g. -/// reg1024 = INSERT_SUBREG reg1024, c1, 0 -bool -SimpleRegisterCoalescing::CanJoinInsertSubRegToPhysReg(unsigned DstReg, - unsigned SrcReg, unsigned SubIdx, - unsigned &RealSrcReg) { - const TargetRegisterClass *RC = mri_->getRegClass(DstReg); - RealSrcReg = tri_->getMatchingSuperReg(SrcReg, SubIdx, RC); - if (!RealSrcReg) { - DEBUG(dbgs() << "\tIncompatible destination regclass: " - << "none of the super-registers of " << tri_->getName(SrcReg) - << " are in " << RC->getName() << ".\n"); - return false; - } - - LiveInterval &LHS = li_->getInterval(DstReg); - if (li_->hasInterval(RealSrcReg) && - LHS.overlaps(li_->getInterval(RealSrcReg))) { - DEBUG({ - dbgs() << "\t\tInterfere with register "; - li_->getInterval(RealSrcReg).print(dbgs(), tri_); - }); - return false; // Not coalescable - } - for (const unsigned* SR = tri_->getSubRegisters(RealSrcReg); *SR; ++SR) - // Do not check SrcReg or its sub-register. JoinIntervals() will take care - // of that. - if (*SR != SrcReg && - !tri_->isSubRegister(SrcReg, *SR) && - li_->hasInterval(*SR) && LHS.overlaps(li_->getInterval(*SR))) { - DEBUG({ - dbgs() << "\t\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); - }); - return false; // Not coalescable - } - return true; -} - -/// getRegAllocPreference - Return register allocation preference register. -/// -static unsigned getRegAllocPreference(unsigned Reg, MachineFunction &MF, - MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return 0; - std::pair Hint = MRI->getRegAllocationHint(Reg); - return TRI->ResolveRegAllocHint(Hint.first, Hint.second, MF); -} /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg, /// which are the src/dst of the copy instruction CopyMI. 
This returns true @@ -1369,354 +1070,97 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { DEBUG(dbgs() << li_->getInstructionIndex(CopyMI) << '\t' << *CopyMI); - unsigned SrcReg, DstReg, SrcSubIdx = 0, DstSubIdx = 0; - bool isExtSubReg = CopyMI->isExtractSubreg(); - bool isInsSubReg = CopyMI->isInsertSubreg(); - bool isSubRegToReg = CopyMI->isSubregToReg(); - unsigned SubIdx = 0; - if (isExtSubReg) { - DstReg = CopyMI->getOperand(0).getReg(); - DstSubIdx = CopyMI->getOperand(0).getSubReg(); - SrcReg = CopyMI->getOperand(1).getReg(); - SrcSubIdx = CopyMI->getOperand(2).getImm(); - } else if (isInsSubReg || isSubRegToReg) { - DstReg = CopyMI->getOperand(0).getReg(); - DstSubIdx = CopyMI->getOperand(3).getImm(); - SrcReg = CopyMI->getOperand(2).getReg(); - SrcSubIdx = CopyMI->getOperand(2).getSubReg(); - if (SrcSubIdx && SrcSubIdx != DstSubIdx) { - // r1025 = INSERT_SUBREG r1025, r1024<2>, 2 Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - DEBUG(dbgs() << "\tSource of insert_subreg or subreg_to_reg is already " - "coalesced to another register.\n"); - return false; // Not coalescable. - } - } else if (tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - if (SrcSubIdx && DstSubIdx && SrcSubIdx != DstSubIdx) { - // e.g. %reg16404:1 = MOV8rr %reg16412:2 - Again = true; - return false; // Not coalescable. - } - } else { - llvm_unreachable("Unrecognized copy instruction!"); + CoalescerPair CP(*tii_, *tri_); + if (!CP.setRegisters(CopyMI)) { + DEBUG(dbgs() << "\tNot coalescable.\n"); + return false; } // If they are already joined we continue. - if (SrcReg == DstReg) { + if (CP.getSrcReg() == CP.getDstReg()) { DEBUG(dbgs() << "\tCopy already coalesced.\n"); return false; // Not coalescable. } - bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg); - bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); - - // If they are both physical registers, we cannot join them. - if (SrcIsPhys && DstIsPhys) { - DEBUG(dbgs() << "\tCan not coalesce physregs.\n"); - return false; // Not coalescable. - } - - // We only join virtual registers with allocatable physical registers. - if (SrcIsPhys && !allocatableRegs_[SrcReg]) { - DEBUG(dbgs() << "\tSrc reg is unallocatable physreg.\n"); - return false; // Not coalescable. - } - if (DstIsPhys && !allocatableRegs_[DstReg]) { - DEBUG(dbgs() << "\tDst reg is unallocatable physreg.\n"); - return false; // Not coalescable. - } - - // We cannot handle dual subreg indices and mismatched classes at the same - // time. - if (SrcSubIdx && DstSubIdx && differingRegisterClasses(SrcReg, DstReg)) { - DEBUG(dbgs() << "\tCannot handle subreg indices and mismatched classes.\n"); - return false; - } + DEBUG(dbgs() << "\tConsidering merging %reg" << CP.getSrcReg()); - // Check that a physical source register is compatible with dst regclass - if (SrcIsPhys) { - unsigned SrcSubReg = SrcSubIdx ? - tri_->getSubReg(SrcReg, SrcSubIdx) : SrcReg; - const TargetRegisterClass *DstRC = mri_->getRegClass(DstReg); - const TargetRegisterClass *DstSubRC = DstRC; - if (DstSubIdx) - DstSubRC = DstRC->getSubRegisterRegClass(DstSubIdx); - assert(DstSubRC && "Illegal subregister index"); - if (!DstSubRC->contains(SrcSubReg)) { - DEBUG(dbgs() << "\tIncompatible destination regclass: " - << "none of the super-registers of " - << tri_->getName(SrcSubReg) << " are in " - << DstSubRC->getName() << ".\n"); - return false; // Not coalescable. 
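JoinCopy now leans on CoalescerPair: setRegisters(CopyMI) pulls the source, destination and sub-register index out of any copy-like instruction, refuses pairs the coalescer cannot handle, and canonicalizes so that a physical register, if present, ends up on the destination side. A toy version of that normalization step (the struct and its helpers are invented and only mimic the queries used above):

    #include <cstdio>

    // Registers below some bound count as "physical" in this toy model.
    static const int FirstVirtReg = 1024;
    static bool isPhysical(int Reg) { return Reg < FirstVirtReg; }

    struct ToyCopy { int DstReg, SrcReg, SubIdx; };

    struct ToyCoalescerPair {
      int DstReg, SrcReg, SubIdx;
      bool Phys;

      // Returns false for pairs the coalescer refuses up front, for example a
      // copy between two physical registers.
      bool setRegisters(const ToyCopy &MI) {
        DstReg = MI.DstReg;
        SrcReg = MI.SrcReg;
        SubIdx = MI.SubIdx;
        if (isPhysical(SrcReg) && isPhysical(DstReg))
          return false;
        if (isPhysical(SrcReg)) {     // canonicalize: physreg on the dst side
          int T = SrcReg; SrcReg = DstReg; DstReg = T;
        }
        Phys = isPhysical(DstReg);
        return true;
      }
    };

    int main() {
      ToyCopy MI = { 2048, 5, 0 };    // virtual register = copy of physreg 5
      ToyCoalescerPair CP;
      if (CP.setRegisters(MI))
        std::printf("dst=%d src=%d phys=%d\n",
                    CP.DstReg, CP.SrcReg, (int)CP.Phys);
      return 0;
    }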
- } - } - - // Check that a physical dst register is compatible with source regclass - if (DstIsPhys) { - unsigned DstSubReg = DstSubIdx ? - tri_->getSubReg(DstReg, DstSubIdx) : DstReg; - const TargetRegisterClass *SrcRC = mri_->getRegClass(SrcReg); - const TargetRegisterClass *SrcSubRC = SrcRC; - if (SrcSubIdx) - SrcSubRC = SrcRC->getSubRegisterRegClass(SrcSubIdx); - assert(SrcSubRC && "Illegal subregister index"); - if (!SrcSubRC->contains(DstSubReg)) { - DEBUG(dbgs() << "\tIncompatible source regclass: " - << "none of the super-registers of " - << tri_->getName(DstSubReg) << " are in " - << SrcSubRC->getName() << ".\n"); - (void)DstSubReg; - return false; // Not coalescable. + // Enforce policies. + if (CP.isPhys()) { + DEBUG(dbgs() <<" with physreg %" << tri_->getName(CP.getDstReg()) << "\n"); + // Only coalesce to allocatable physreg. + if (!allocatableRegs_[CP.getDstReg()]) { + DEBUG(dbgs() << "\tRegister is an unallocatable physreg.\n"); + return false; // Not coalescable. } - } - - // Should be non-null only when coalescing to a sub-register class. - bool CrossRC = false; - const TargetRegisterClass *SrcRC= SrcIsPhys ? 0 : mri_->getRegClass(SrcReg); - const TargetRegisterClass *DstRC= DstIsPhys ? 0 : mri_->getRegClass(DstReg); - const TargetRegisterClass *NewRC = NULL; - unsigned RealDstReg = 0; - unsigned RealSrcReg = 0; - if (isExtSubReg || isInsSubReg || isSubRegToReg) { - SubIdx = CopyMI->getOperand(isExtSubReg ? 2 : 3).getImm(); - if (SrcIsPhys && isExtSubReg) { - // r1024 = EXTRACT_SUBREG EAX, 0 then r1024 is really going to be - // coalesced with AX. - unsigned DstSubIdx = CopyMI->getOperand(0).getSubReg(); - if (DstSubIdx) { - // r1024<2> = EXTRACT_SUBREG EAX, 2. Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - if (DstSubIdx != SubIdx) { - DEBUG(dbgs() << "\t Sub-register indices mismatch.\n"); - return false; // Not coalescable. - } - } else - SrcReg = tri_->getSubReg(SrcReg, SubIdx); - SubIdx = 0; - } else if (DstIsPhys && (isInsSubReg || isSubRegToReg)) { - // EAX = INSERT_SUBREG EAX, r1024, 0 - unsigned SrcSubIdx = CopyMI->getOperand(2).getSubReg(); - if (SrcSubIdx) { - // EAX = INSERT_SUBREG EAX, r1024<2>, 2 Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - if (SrcSubIdx != SubIdx) { - DEBUG(dbgs() << "\t Sub-register indices mismatch.\n"); - return false; // Not coalescable. - } - } else - DstReg = tri_->getSubReg(DstReg, SubIdx); - SubIdx = 0; - } else if ((DstIsPhys && isExtSubReg) || - (SrcIsPhys && (isInsSubReg || isSubRegToReg))) { - if (!isSubRegToReg && CopyMI->getOperand(1).getSubReg()) { - DEBUG(dbgs() << "\tSrc of extract_subreg already coalesced with reg" - << " of a super-class.\n"); - return false; // Not coalescable. - } - - // FIXME: The following checks are somewhat conservative. Perhaps a better - // way to implement this is to treat this as coalescing a vr with the - // super physical register. - if (isExtSubReg) { - if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealDstReg)) - return false; // Not coalescable - } else { - if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg)) - return false; // Not coalescable - } - SubIdx = 0; - } else { - unsigned OldSubIdx = isExtSubReg ? CopyMI->getOperand(0).getSubReg() - : CopyMI->getOperand(2).getSubReg(); - if (OldSubIdx) { - if (OldSubIdx == SubIdx && !differingRegisterClasses(SrcReg, DstReg)) - // r1024<2> = EXTRACT_SUBREG r1025, 2. 
Then r1024 has already been - // coalesced to a larger register so the subreg indices cancel out. - // Also check if the other larger register is of the same register - // class as the would be resulting register. - SubIdx = 0; - else { - DEBUG(dbgs() << "\t Sub-register indices mismatch.\n"); - return false; // Not coalescable. - } - } - if (SubIdx) { - if (!DstIsPhys && !SrcIsPhys) { - if (isInsSubReg || isSubRegToReg) { - NewRC = tri_->getMatchingSuperRegClass(DstRC, SrcRC, SubIdx); - } else // extract_subreg { - NewRC = tri_->getMatchingSuperRegClass(SrcRC, DstRC, SubIdx); - } - if (!NewRC) { - DEBUG(dbgs() << "\t Conflicting sub-register indices.\n"); - return false; // Not coalescable - } + } else { + DEBUG({ + dbgs() << " with reg%" << CP.getDstReg(); + if (CP.getSubIdx()) + dbgs() << ":" << tri_->getSubRegIndexName(CP.getSubIdx()); + dbgs() << " to " << CP.getNewRC()->getName() << "\n"; + }); - if (!isWinToJoinCrossClass(SrcReg, DstReg, SrcRC, DstRC, NewRC)) { - DEBUG(dbgs() << "\tAvoid coalescing to constrained register class: " - << SrcRC->getName() << "/" - << DstRC->getName() << " -> " - << NewRC->getName() << ".\n"); - Again = true; // May be possible to coalesce later. - return false; - } - } - } - } else if (differingRegisterClasses(SrcReg, DstReg)) { - if (DisableCrossClassJoin) - return false; - CrossRC = true; - - // FIXME: What if the result of a EXTRACT_SUBREG is then coalesced - // with another? If it's the resulting destination register, then - // the subidx must be propagated to uses (but only those defined - // by the EXTRACT_SUBREG). If it's being coalesced into another - // register, it should be safe because register is assumed to have - // the register class of the super-register. - - // Process moves where one of the registers have a sub-register index. - MachineOperand *DstMO = CopyMI->findRegisterDefOperand(DstReg); - MachineOperand *SrcMO = CopyMI->findRegisterUseOperand(SrcReg); - SubIdx = DstMO->getSubReg(); - if (SubIdx) { - if (SrcMO->getSubReg()) - // FIXME: can we handle this? + // Avoid constraining virtual register regclass too much. + if (CP.isCrossClass()) { + if (DisableCrossClassJoin) { + DEBUG(dbgs() << "\tCross-class joins disabled.\n"); return false; - // This is not an insert_subreg but it looks like one. - // e.g. %reg1024:4 = MOV32rr %EAX - isInsSubReg = true; - if (SrcIsPhys) { - if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg)) - return false; // Not coalescable - SubIdx = 0; - } - } else { - SubIdx = SrcMO->getSubReg(); - if (SubIdx) { - // This is not a extract_subreg but it looks like one. - // e.g. %cl = MOV16rr %reg1024:1 - isExtSubReg = true; - if (DstIsPhys) { - if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx,RealDstReg)) - return false; // Not coalescable - SubIdx = 0; - } - } - } - - // Now determine the register class of the joined register. - if (!SrcIsPhys && !DstIsPhys) { - if (isExtSubReg) { - NewRC = - SubIdx ? tri_->getMatchingSuperRegClass(SrcRC, DstRC, SubIdx) : SrcRC; - } else if (isInsSubReg) { - NewRC = - SubIdx ? tri_->getMatchingSuperRegClass(DstRC, SrcRC, SubIdx) : DstRC; - } else { - NewRC = getCommonSubClass(SrcRC, DstRC); - } - - if (!NewRC) { - DEBUG(dbgs() << "\tDisjoint regclasses: " - << SrcRC->getName() << ", " - << DstRC->getName() << ".\n"); - return false; // Not coalescable. } - - // If we are joining two virtual registers and the resulting register - // class is more restrictive (fewer register, smaller size). Check if it's - // worth doing the merge. 
- if (!isWinToJoinCrossClass(SrcReg, DstReg, SrcRC, DstRC, NewRC)) { + if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(), + mri_->getRegClass(CP.getSrcReg()), + mri_->getRegClass(CP.getDstReg()), + CP.getNewRC())) { DEBUG(dbgs() << "\tAvoid coalescing to constrained register class: " - << SrcRC->getName() << "/" - << DstRC->getName() << " -> " - << NewRC->getName() << ".\n"); - // Allow the coalescer to try again in case either side gets coalesced to - // a physical register that's compatible with the other side. e.g. - // r1024 = MOV32to32_ r1025 - // But later r1024 is assigned EAX then r1025 may be coalesced with EAX. + << CP.getNewRC()->getName() << ".\n"); Again = true; // May be possible to coalesce later. return false; } } - } - - // Will it create illegal extract_subreg / insert_subreg? - if (SrcIsPhys && HasIncompatibleSubRegDefUse(CopyMI, DstReg, SrcReg)) - return false; - if (DstIsPhys && HasIncompatibleSubRegDefUse(CopyMI, SrcReg, DstReg)) - return false; - - LiveInterval &SrcInt = li_->getInterval(SrcReg); - LiveInterval &DstInt = li_->getInterval(DstReg); - assert(SrcInt.reg == SrcReg && DstInt.reg == DstReg && - "Register mapping is horribly broken!"); - DEBUG({ - dbgs() << "\t\tInspecting "; - if (SrcRC) dbgs() << SrcRC->getName() << ": "; - SrcInt.print(dbgs(), tri_); - dbgs() << "\n\t\t and "; - if (DstRC) dbgs() << DstRC->getName() << ": "; - DstInt.print(dbgs(), tri_); - dbgs() << "\n"; - }); + // When possible, let DstReg be the larger interval. + if (!CP.getSubIdx() && li_->getInterval(CP.getSrcReg()).ranges.size() > + li_->getInterval(CP.getDstReg()).ranges.size()) + CP.flip(); + } + + // We need to be careful about coalescing a source physical register with a + // virtual register. Once the coalescing is done, it cannot be broken and + // these are not spillable! If the destination interval uses are far away, + // think twice about coalescing them! + // FIXME: Why are we skipping this test for partial copies? + // CodeGen/X86/phys_subreg_coalesce-3.ll needs it. + if (!CP.isPartial() && CP.isPhys()) { + LiveInterval &JoinVInt = li_->getInterval(CP.getSrcReg()); + + // Don't join with physregs that have a ridiculous number of live + // ranges. The data structure performance is really bad when that + // happens. + if (li_->hasInterval(CP.getDstReg()) && + li_->getInterval(CP.getDstReg()).ranges.size() > 1000) { + mri_->setRegAllocationHint(CP.getSrcReg(), 0, CP.getDstReg()); + ++numAborts; + DEBUG(dbgs() + << "\tPhysical register live interval too complicated, abort!\n"); + return false; + } - // Save a copy of the virtual register live interval. We'll manually - // merge this into the "real" physical register live interval this is - // coalesced with. - OwningPtr SavedLI; - if (RealDstReg) - SavedLI.reset(li_->dupInterval(&SrcInt)); - else if (RealSrcReg) - SavedLI.reset(li_->dupInterval(&DstInt)); - - if (!isExtSubReg && !isInsSubReg && !isSubRegToReg) { - // Check if it is necessary to propagate "isDead" property. - MachineOperand *mopd = CopyMI->findRegisterDefOperand(DstReg, false); - bool isDead = mopd->isDead(); - - // We need to be careful about coalescing a source physical register with a - // virtual register. Once the coalescing is done, it cannot be broken and - // these are not spillable! If the destination interval uses are far away, - // think twice about coalescing them! 
- if (!isDead && (SrcIsPhys || DstIsPhys)) { - // If the virtual register live interval is long but it has low use - // density, do not join them, instead mark the physical register as its - // allocation preference. - LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt; - LiveInterval &JoinPInt = SrcIsPhys ? SrcInt : DstInt; - unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg; - unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg; - - // Don't join with physregs that have a ridiculous number of live - // ranges. The data structure performance is really bad when that - // happens. - if (JoinPInt.ranges.size() > 1000) { - mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); - ++numAborts; - DEBUG(dbgs() - << "\tPhysical register live interval too complicated, abort!\n"); - return false; - } + const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg()); + unsigned Threshold = allocatableRCRegs_[RC].count() * 2; + unsigned Length = li_->getApproximateInstructionCount(JoinVInt); + if (Length > Threshold && + std::distance(mri_->use_nodbg_begin(CP.getSrcReg()), + mri_->use_nodbg_end()) * Threshold < Length) { + // Before giving up coalescing, if definition of source is defined by + // trivial computation, try rematerializing it. + if (!CP.isFlipped() && + ReMaterializeTrivialDef(JoinVInt, CP.getDstReg(), 0, CopyMI)) + return true; - const TargetRegisterClass *RC = mri_->getRegClass(JoinVReg); - unsigned Threshold = allocatableRCRegs_[RC].count() * 2; - unsigned Length = li_->getApproximateInstructionCount(JoinVInt); - if (Length > Threshold && - std::distance(mri_->use_nodbg_begin(JoinVReg), - mri_->use_nodbg_end()) * Threshold < Length) { - // Before giving up coalescing, if definition of source is defined by - // trivial computation, try rematerializing it. - if (ReMaterializeTrivialDef(SrcInt, DstReg, DstSubIdx, CopyMI)) - return true; - - mri_->setRegAllocationHint(JoinVInt.reg, 0, JoinPReg); - ++numAborts; - DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); - Again = true; // May be possible to coalesce later. - return false; - } + mri_->setRegAllocationHint(CP.getSrcReg(), 0, CP.getDstReg()); + ++numAborts; + DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n"); + Again = true; // May be possible to coalesce later. + return false; } } @@ -1724,32 +1168,24 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // Otherwise, if one of the intervals being joined is a physreg, this method // always canonicalizes DstInt to be it. The output "SrcInt" will not have // been modified, so we can use this information below to update aliases. - bool Swapped = false; - // If SrcInt is implicitly defined, it's safe to coalesce. - if (SrcInt.empty()) { - if (!CanCoalesceWithImpDef(CopyMI, DstInt, SrcInt)) { - // Only coalesce an empty interval (defined by implicit_def) with - // another interval which has a valno defined by the CopyMI and the CopyMI - // is a kill of the implicit def. - DEBUG(dbgs() << "\tNot profitable!\n"); - return false; - } - } else if (!JoinIntervals(DstInt, SrcInt, Swapped)) { + if (!JoinIntervals(CP)) { // Coalescing failed. // If definition of source is defined by trivial computation, try // rematerializing it. - if (!isExtSubReg && !isInsSubReg && !isSubRegToReg && - ReMaterializeTrivialDef(SrcInt, DstReg, DstSubIdx, CopyMI)) + if (!CP.isFlipped() && + ReMaterializeTrivialDef(li_->getInterval(CP.getSrcReg()), + CP.getDstReg(), 0, CopyMI)) return true; // If we can eliminate the copy without merging the live ranges, do so now. 
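The added block above throttles virtual-with-physical joins: a physreg interval that already has a huge number of ranges is rejected outright, and otherwise the virtual interval's approximate length is weighed against a threshold derived from the register class size and the number of non-debug uses, trying rematerialization before giving up. A sketch of that length-versus-density test using the constants visible in the code (everything else is invented):

    #include <cstdio>

    // Decide whether joining a virtual interval with a physreg looks
    // profitable.  Mirrors: Threshold = allocatable regs in class * 2;
    // reject when Length > Threshold and Uses * Threshold < Length.
    static bool looksProfitable(unsigned Length, unsigned Uses,
                                unsigned AllocatableRegsInClass) {
      unsigned Threshold = AllocatableRegsInClass * 2;
      if (Length > Threshold && Uses * Threshold < Length)
        return false;        // long, sparsely used interval: keep it virtual
      return true;
    }

    int main() {
      std::printf("%d\n", looksProfitable(200, 3, 8));  // 0: would tie down a physreg
      std::printf("%d\n", looksProfitable(10, 3, 8));   // 1: short interval, join it
      return 0;
    }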
- if (!isExtSubReg && !isInsSubReg && !isSubRegToReg && - (AdjustCopiesBackFrom(SrcInt, DstInt, CopyMI) || - RemoveCopyByCommutingDef(SrcInt, DstInt, CopyMI))) { - JoinedCopies.insert(CopyMI); - DEBUG(dbgs() << "\tTrivial!\n"); - return true; + if (!CP.isPartial()) { + if (AdjustCopiesBackFrom(CP, CopyMI) || + RemoveCopyByCommutingDef(CP, CopyMI)) { + JoinedCopies.insert(CopyMI); + DEBUG(dbgs() << "\tTrivial!\n"); + return true; + } } // Otherwise, we are unable to join the intervals. @@ -1758,86 +1194,32 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { return false; } - LiveInterval *ResSrcInt = &SrcInt; - LiveInterval *ResDstInt = &DstInt; - if (Swapped) { - std::swap(SrcReg, DstReg); - std::swap(ResSrcInt, ResDstInt); - } - assert(TargetRegisterInfo::isVirtualRegister(SrcReg) && - "LiveInterval::join didn't work right!"); - - // If we're about to merge live ranges into a physical register live interval, - // we have to update any aliased register's live ranges to indicate that they - // have clobbered values for this range. - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - // If this is a extract_subreg where dst is a physical register, e.g. - // cl = EXTRACT_SUBREG reg1024, 1 - // then create and update the actual physical register allocated to RHS. - if (RealDstReg || RealSrcReg) { - LiveInterval &RealInt = - li_->getOrCreateInterval(RealDstReg ? RealDstReg : RealSrcReg); - for (LiveInterval::const_vni_iterator I = SavedLI->vni_begin(), - E = SavedLI->vni_end(); I != E; ++I) { - const VNInfo *ValNo = *I; - VNInfo *NewValNo = RealInt.getNextValue(ValNo->def, ValNo->getCopy(), - false, // updated at * - li_->getVNInfoAllocator()); - NewValNo->setFlags(ValNo->getFlags()); // * updated here. - RealInt.addKills(NewValNo, ValNo->kills); - RealInt.MergeValueInAsValue(*SavedLI, ValNo, NewValNo); - } - RealInt.weight += SavedLI->weight; - DstReg = RealDstReg ? RealDstReg : RealSrcReg; - } - - // Update the liveintervals of sub-registers. - for (const unsigned *AS = tri_->getSubRegisters(DstReg); *AS; ++AS) - li_->getOrCreateInterval(*AS).MergeInClobberRanges(*li_, *ResSrcInt, - li_->getVNInfoAllocator()); - } - - // If this is a EXTRACT_SUBREG, make sure the result of coalescing is the - // larger super-register. - if ((isExtSubReg || isInsSubReg || isSubRegToReg) && - !SrcIsPhys && !DstIsPhys) { - if ((isExtSubReg && !Swapped) || - ((isInsSubReg || isSubRegToReg) && Swapped)) { - ResSrcInt->Copy(*ResDstInt, mri_, li_->getVNInfoAllocator()); - std::swap(SrcReg, DstReg); - std::swap(ResSrcInt, ResDstInt); - } - } - // Coalescing to a virtual register that is of a sub-register class of the // other. Make sure the resulting register is set to the right register class. - if (CrossRC) + if (CP.isCrossClass()) { ++numCrossRCs; - - // This may happen even if it's cross-rc coalescing. e.g. - // %reg1026 = SUBREG_TO_REG 0, %reg1037, 4 - // reg1026 -> GR64, reg1037 -> GR32_ABCD. The resulting register will have to - // be allocate a register from GR64_ABCD. - if (NewRC) - mri_->setRegClass(DstReg, NewRC); + mri_->setRegClass(CP.getDstReg(), CP.getNewRC()); + } // Remember to delete the copy instruction. JoinedCopies.insert(CopyMI); - UpdateRegDefsUses(SrcReg, DstReg, SubIdx); + UpdateRegDefsUses(CP); // If we have extended the live range of a physical register, make sure we // update live-in lists as well. 
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { - const LiveInterval &VRegInterval = li_->getInterval(SrcReg); + if (CP.isPhys()) { SmallVector BlockSeq; - for (LiveInterval::const_iterator I = VRegInterval.begin(), - E = VRegInterval.end(); I != E; ++I ) { + // JoinIntervals invalidates the VNInfos in SrcInt, but we only need the + // ranges for this, and they are preserved. + LiveInterval &SrcInt = li_->getInterval(CP.getSrcReg()); + for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end(); + I != E; ++I ) { li_->findLiveInMBBs(I->start, I->end, BlockSeq); for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) { MachineBasicBlock &block = *BlockSeq[idx]; - if (!block.isLiveIn(DstReg)) - block.addLiveIn(DstReg); + if (!block.isLiveIn(CP.getDstReg())) + block.addLiveIn(CP.getDstReg()); } BlockSeq.clear(); } @@ -1845,32 +1227,17 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) { // SrcReg is guarateed to be the register whose live interval that is // being merged. - li_->removeInterval(SrcReg); + li_->removeInterval(CP.getSrcReg()); // Update regalloc hint. - tri_->UpdateRegAllocHint(SrcReg, DstReg, *mf_); - - // Manually deleted the live interval copy. - if (SavedLI) { - SavedLI->clear(); - SavedLI.reset(); - } - - // If resulting interval has a preference that no longer fits because of subreg - // coalescing, just clear the preference. - unsigned Preference = getRegAllocPreference(ResDstInt->reg, *mf_, mri_, tri_); - if (Preference && (isExtSubReg || isInsSubReg || isSubRegToReg) && - TargetRegisterInfo::isVirtualRegister(ResDstInt->reg)) { - const TargetRegisterClass *RC = mri_->getRegClass(ResDstInt->reg); - if (!RC->contains(Preference)) - mri_->setRegAllocationHint(ResDstInt->reg, 0, 0); - } + tri_->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *mf_); DEBUG({ - dbgs() << "\t\tJoined. Result = "; - ResDstInt->print(dbgs(), tri_); - dbgs() << "\n"; - }); + LiveInterval &DstInt = li_->getInterval(CP.getDstReg()); + dbgs() << "\tJoined. Result = "; + DstInt.print(dbgs(), tri_); + dbgs() << "\n"; + }); ++numJoins; return true; @@ -1927,263 +1294,53 @@ static unsigned ComputeUltimateVN(VNInfo *VNI, return ThisValNoAssignments[VN] = UltimateVN; } -static bool InVector(VNInfo *Val, const SmallVector &V) { - return std::find(V.begin(), V.end(), Val) != V.end(); -} - -static bool isValNoDefMove(const MachineInstr *MI, unsigned DR, unsigned SR, - const TargetInstrInfo *TII, - const TargetRegisterInfo *TRI) { - unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (TII->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) - ; - else if (MI->isExtractSubreg()) { - DstReg = MI->getOperand(0).getReg(); - SrcReg = MI->getOperand(1).getReg(); - } else if (MI->isSubregToReg() || - MI->isInsertSubreg()) { - DstReg = MI->getOperand(0).getReg(); - SrcReg = MI->getOperand(2).getReg(); - } else - return false; - return (SrcReg == SR || TRI->isSuperRegister(SR, SrcReg)) && - (DstReg == DR || TRI->isSuperRegister(DR, DstReg)); -} - -/// RangeIsDefinedByCopyFromReg - Return true if the specified live range of -/// the specified live interval is defined by a copy from the specified -/// register. -bool SimpleRegisterCoalescing::RangeIsDefinedByCopyFromReg(LiveInterval &li, - LiveRange *LR, - unsigned Reg) { - unsigned SrcReg = li_->getVNInfoSourceReg(LR->valno); - if (SrcReg == Reg) - return true; - // FIXME: Do isPHIDef and isDefAccurate both need to be tested? 
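When the destination is a physical register, the loop above walks every range of the source interval, asks which basic blocks that range is live into, and adds DstReg to each such block's live-in list if it is not already there. A small standalone model of that propagation with invented block and range types:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct ToyBlock { int Begin, End; std::set<int> LiveIns; };
    struct ToyRange { int Start, End; };

    // A block whose entry point falls strictly inside a range is live-in for
    // that range, so the register must be added to its live-in set.
    static void addLiveIns(std::vector<ToyBlock> &Blocks,
                           const std::vector<ToyRange> &Ranges, int Reg) {
      for (size_t r = 0, re = Ranges.size(); r != re; ++r)
        for (size_t b = 0, be = Blocks.size(); b != be; ++b)
          if (Ranges[r].Start < Blocks[b].Begin &&
              Blocks[b].Begin < Ranges[r].End)
            Blocks[b].LiveIns.insert(Reg);  // insert() ignores duplicates

    }

    int main() {
      std::vector<ToyBlock> Blocks(2);
      Blocks[0].Begin = 0;  Blocks[0].End = 10;
      Blocks[1].Begin = 10; Blocks[1].End = 20;
      std::vector<ToyRange> Ranges(1);
      Ranges[0].Start = 5; Ranges[0].End = 15;  // crosses into the second block
      addLiveIns(Blocks, Ranges, 42);
      std::printf("block1 live-in: %d\n", (int)Blocks[1].LiveIns.count(42));
      return 0;
    }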
- if ((LR->valno->isPHIDef() || !LR->valno->isDefAccurate()) && - TargetRegisterInfo::isPhysicalRegister(li.reg) && - *tri_->getSuperRegisters(li.reg)) { - // It's a sub-register live interval, we may not have precise information. - // Re-compute it. - MachineInstr *DefMI = li_->getInstructionFromIndex(LR->start); - if (DefMI && isValNoDefMove(DefMI, li.reg, Reg, tii_, tri_)) { - // Cache computed info. - LR->valno->def = LR->start; - LR->valno->setCopy(DefMI); - return true; - } - } - return false; -} - - -/// ValueLiveAt - Return true if the LiveRange pointed to by the given -/// iterator, or any subsequent range with the same value number, -/// is live at the given point. -bool SimpleRegisterCoalescing::ValueLiveAt(LiveInterval::iterator LRItr, - LiveInterval::iterator LREnd, - SlotIndex defPoint) const { - for (const VNInfo *valno = LRItr->valno; - (LRItr != LREnd) && (LRItr->valno == valno); ++LRItr) { - if (LRItr->contains(defPoint)) - return true; - } - - return false; -} - - -/// SimpleJoin - Attempt to joint the specified interval into this one. The -/// caller of this method must guarantee that the RHS only contains a single -/// value number and that the RHS is not defined by a copy from this -/// interval. This returns false if the intervals are not joinable, or it -/// joins them and returns true. -bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){ - assert(RHS.containsOneValue()); - - // Some number (potentially more than one) value numbers in the current - // interval may be defined as copies from the RHS. Scan the overlapping - // portions of the LHS and RHS, keeping track of this and looking for - // overlapping live ranges that are NOT defined as copies. If these exist, we - // cannot coalesce. - - LiveInterval::iterator LHSIt = LHS.begin(), LHSEnd = LHS.end(); - LiveInterval::iterator RHSIt = RHS.begin(), RHSEnd = RHS.end(); - - if (LHSIt->start < RHSIt->start) { - LHSIt = std::upper_bound(LHSIt, LHSEnd, RHSIt->start); - if (LHSIt != LHS.begin()) --LHSIt; - } else if (RHSIt->start < LHSIt->start) { - RHSIt = std::upper_bound(RHSIt, RHSEnd, LHSIt->start); - if (RHSIt != RHS.begin()) --RHSIt; - } - - SmallVector EliminatedLHSVals; - - while (1) { - // Determine if these live intervals overlap. - bool Overlaps = false; - if (LHSIt->start <= RHSIt->start) - Overlaps = LHSIt->end > RHSIt->start; - else - Overlaps = RHSIt->end > LHSIt->start; - - // If the live intervals overlap, there are two interesting cases: if the - // LHS interval is defined by a copy from the RHS, it's ok and we record - // that the LHS value # is the same as the RHS. If it's not, then we cannot - // coalesce these live ranges and we bail out. - if (Overlaps) { - // If we haven't already recorded that this value # is safe, check it. - if (!InVector(LHSIt->valno, EliminatedLHSVals)) { - // If it's re-defined by an early clobber somewhere in the live range, - // then conservatively abort coalescing. - if (LHSIt->valno->hasRedefByEC()) - return false; - // Copy from the RHS? - if (!RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg)) - return false; // Nope, bail out. - - if (ValueLiveAt(LHSIt, LHS.end(), RHSIt->valno->def)) - // Here is an interesting situation: - // BB1: - // vr1025 = copy vr1024 - // .. - // BB2: - // vr1024 = op - // = vr1025 - // Even though vr1025 is copied from vr1024, it's not safe to - // coalesce them since the live range of vr1025 intersects the - // def of vr1024. 
This happens because vr1025 is assigned the - // value of the previous iteration of vr1024. +/// JoinIntervals - Attempt to join these two intervals. On failure, this +/// returns false. +bool SimpleRegisterCoalescing::JoinIntervals(CoalescerPair &CP) { + LiveInterval &RHS = li_->getInterval(CP.getSrcReg()); + DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), tri_); dbgs() << "\n"; }); + + // If a live interval is a physical register, check for interference with any + // aliases. The interference check implemented here is a bit more conservative + // than the full interfeence check below. We allow overlapping live ranges + // only when one is a copy of the other. + if (CP.isPhys()) { + for (const unsigned *AS = tri_->getAliasSet(CP.getDstReg()); *AS; ++AS){ + if (!li_->hasInterval(*AS)) + continue; + const LiveInterval &LHS = li_->getInterval(*AS); + LiveInterval::const_iterator LI = LHS.begin(); + for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end(); + RI != RE; ++RI) { + LI = std::lower_bound(LI, LHS.end(), RI->start); + // Does LHS have an overlapping live range starting before RI? + if ((LI != LHS.begin() && LI[-1].end > RI->start) && + (RI->start != RI->valno->def || + !CP.isCoalescable(li_->getInstructionFromIndex(RI->start)))) { + DEBUG({ + dbgs() << "\t\tInterference from alias: "; + LHS.print(dbgs(), tri_); + dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n"; + }); return false; - EliminatedLHSVals.push_back(LHSIt->valno); - } - - // We know this entire LHS live range is okay, so skip it now. - if (++LHSIt == LHSEnd) break; - continue; - } + } - if (LHSIt->end < RHSIt->end) { - if (++LHSIt == LHSEnd) break; - } else { - // One interesting case to check here. It's possible that we have - // something like "X3 = Y" which defines a new value number in the LHS, - // and is the last use of this liverange of the RHS. In this case, we - // want to notice this copy (so that it gets coalesced away) even though - // the live ranges don't actually overlap. - if (LHSIt->start == RHSIt->end) { - if (InVector(LHSIt->valno, EliminatedLHSVals)) { - // We already know that this value number is going to be merged in - // if coalescing succeeds. Just skip the liverange. - if (++LHSIt == LHSEnd) break; - } else { - // If it's re-defined by an early clobber somewhere in the live range, - // then conservatively abort coalescing. - if (LHSIt->valno->hasRedefByEC()) + // Check that LHS ranges beginning in this range are copies. + for (; LI != LHS.end() && LI->start < RI->end; ++LI) { + if (LI->start != LI->valno->def || + !CP.isCoalescable(li_->getInstructionFromIndex(LI->start))) { + DEBUG({ + dbgs() << "\t\tInterference from alias: "; + LHS.print(dbgs(), tri_); + dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n"; + }); return false; - // Otherwise, if this is a copy from the RHS, mark it as being merged - // in. - if (RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg)) { - if (ValueLiveAt(LHSIt, LHS.end(), RHSIt->valno->def)) - // Here is an interesting situation: - // BB1: - // vr1025 = copy vr1024 - // .. - // BB2: - // vr1024 = op - // = vr1025 - // Even though vr1025 is copied from vr1024, it's not safe to - // coalesced them since live range of vr1025 intersects the - // def of vr1024. This happens because vr1025 is assigned the - // value of the previous iteration of vr1024. - return false; - EliminatedLHSVals.push_back(LHSIt->valno); - - // We know this entire LHS live range is okay, so skip it now. 
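The new physreg path in JoinIntervals shown above tolerates an overlap between RHS and an alias interval only when the overlapping range is itself defined by a coalescable copy; any other overlap is interference. Because both range lists are sorted, each RHS range needs only a std::lower_bound into the alias interval plus a short forward scan. A standalone sketch of the geometric part of that test on plain sorted ranges (the copy and value-number checks are omitted):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct ToyRange {
      int Start, End;
      bool operator<(int Idx) const { return Start < Idx; }  // for lower_bound
    };

    // Return true if any range of B overlaps a range of A.  Both vectors are
    // sorted by Start and ranges are half-open [Start, End).
    static bool overlaps(const std::vector<ToyRange> &A,
                         const std::vector<ToyRange> &B) {
      std::vector<ToyRange>::const_iterator LI = A.begin();
      for (size_t i = 0, e = B.size(); i != e; ++i) {
        LI = std::lower_bound(LI, A.end(), B[i].Start);
        // Does A have a range starting before B[i] that is still live there?
        if (LI != A.begin() && (LI - 1)->End > B[i].Start)
          return true;
        // Does A have a range starting inside B[i]?
        if (LI != A.end() && LI->Start < B[i].End)
          return true;
      }
      return false;
    }

    int main() {
      std::vector<ToyRange> A, B;
      ToyRange R;
      R.Start = 0;  R.End = 10; A.push_back(R);
      R.Start = 20; R.End = 30; A.push_back(R);
      R.Start = 12; R.End = 18; B.push_back(R);  // falls in the gap: no overlap
      std::printf("%d\n", (int)overlaps(A, B));
      return 0;
    }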
- if (++LHSIt == LHSEnd) break; } } } - - if (++RHSIt == RHSEnd) break; - } - } - - // If we got here, we know that the coalescing will be successful and that - // the value numbers in EliminatedLHSVals will all be merged together. Since - // the most common case is that EliminatedLHSVals has a single number, we - // optimize for it: if there is more than one value, we merge them all into - // the lowest numbered one, then handle the interval as if we were merging - // with one value number. - VNInfo *LHSValNo = NULL; - if (EliminatedLHSVals.size() > 1) { - // Loop through all the equal value numbers merging them into the smallest - // one. - VNInfo *Smallest = EliminatedLHSVals[0]; - for (unsigned i = 1, e = EliminatedLHSVals.size(); i != e; ++i) { - if (EliminatedLHSVals[i]->id < Smallest->id) { - // Merge the current notion of the smallest into the smaller one. - LHS.MergeValueNumberInto(Smallest, EliminatedLHSVals[i]); - Smallest = EliminatedLHSVals[i]; - } else { - // Merge into the smallest. - LHS.MergeValueNumberInto(EliminatedLHSVals[i], Smallest); - } } - LHSValNo = Smallest; - } else if (EliminatedLHSVals.empty()) { - if (TargetRegisterInfo::isPhysicalRegister(LHS.reg) && - *tri_->getSuperRegisters(LHS.reg)) - // Imprecise sub-register information. Can't handle it. - return false; - llvm_unreachable("No copies from the RHS?"); - } else { - LHSValNo = EliminatedLHSVals[0]; - } - - // Okay, now that there is a single LHS value number that we're merging the - // RHS into, update the value number info for the LHS to indicate that the - // value number is defined where the RHS value number was. - const VNInfo *VNI = RHS.getValNumInfo(0); - LHSValNo->def = VNI->def; - LHSValNo->setCopy(VNI->getCopy()); - - // Okay, the final step is to loop over the RHS live intervals, adding them to - // the LHS. - if (VNI->hasPHIKill()) - LHSValNo->setHasPHIKill(true); - LHS.addKills(LHSValNo, VNI->kills); - LHS.MergeRangesInAsValue(RHS, LHSValNo); - - LHS.ComputeJoinedWeight(RHS); - - // Update regalloc hint if both are virtual registers. - if (TargetRegisterInfo::isVirtualRegister(LHS.reg) && - TargetRegisterInfo::isVirtualRegister(RHS.reg)) { - std::pair RHSPref = mri_->getRegAllocationHint(RHS.reg); - std::pair LHSPref = mri_->getRegAllocationHint(LHS.reg); - if (RHSPref != LHSPref) - mri_->setRegAllocationHint(LHS.reg, RHSPref.first, RHSPref.second); } - // Update the liveintervals of sub-registers. - if (TargetRegisterInfo::isPhysicalRegister(LHS.reg)) - for (const unsigned *AS = tri_->getSubRegisters(LHS.reg); *AS; ++AS) - li_->getOrCreateInterval(*AS).MergeInClobberRanges(*li_, LHS, - li_->getVNInfoAllocator()); - - return true; -} - -/// JoinIntervals - Attempt to join these two intervals. On failure, this -/// returns false. Otherwise, if one of the intervals being joined is a -/// physreg, this method always canonicalizes LHS to be it. The output -/// "RHS" will not have been modified, so we can use this information -/// below to update aliases. -bool -SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, - bool &Swapped) { // Compute the final value assignment, assuming that the live ranges can be // coalesced. SmallVector LHSValNoAssignments; @@ -2192,203 +1349,87 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, DenseMap RHSValsDefinedFromLHS; SmallVector NewVNInfo; - // If a live interval is a physical register, conservatively check if any - // of its sub-registers is overlapping the live interval of the virtual - // register. 
If so, do not coalesce. - if (TargetRegisterInfo::isPhysicalRegister(LHS.reg) && - *tri_->getSubRegisters(LHS.reg)) { - // If it's coalescing a virtual register to a physical register, estimate - // its live interval length. This is the *cost* of scanning an entire live - // interval. If the cost is low, we'll do an exhaustive check instead. - - // If this is something like this: - // BB1: - // v1024 = op - // ... - // BB2: - // ... - // RAX = v1024 - // - // That is, the live interval of v1024 crosses a bb. Then we can't rely on - // less conservative check. It's possible a sub-register is defined before - // v1024 (or live in) and live out of BB1. - if (RHS.containsOneValue() && - li_->intervalIsInOneMBB(RHS) && - li_->getApproximateInstructionCount(RHS) <= 10) { - // Perform a more exhaustive check for some common cases. - if (li_->conflictsWithSubPhysRegRef(RHS, LHS.reg, true, JoinedCopies)) - return false; - } else { - for (const unsigned* SR = tri_->getSubRegisters(LHS.reg); *SR; ++SR) - if (li_->hasInterval(*SR) && RHS.overlaps(li_->getInterval(*SR))) { - DEBUG({ - dbgs() << "\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); - }); - return false; - } - } - } else if (TargetRegisterInfo::isPhysicalRegister(RHS.reg) && - *tri_->getSubRegisters(RHS.reg)) { - if (LHS.containsOneValue() && - li_->getApproximateInstructionCount(LHS) <= 10) { - // Perform a more exhaustive check for some common cases. - if (li_->conflictsWithSubPhysRegRef(LHS, RHS.reg, false, JoinedCopies)) - return false; - } else { - for (const unsigned* SR = tri_->getSubRegisters(RHS.reg); *SR; ++SR) - if (li_->hasInterval(*SR) && LHS.overlaps(li_->getInterval(*SR))) { - DEBUG({ - dbgs() << "\tInterfere with sub-register "; - li_->getInterval(*SR).print(dbgs(), tri_); - }); - return false; - } - } - } + LiveInterval &LHS = li_->getOrCreateInterval(CP.getDstReg()); + DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), tri_); dbgs() << "\n"; }); - // Compute ultimate value numbers for the LHS and RHS values. - if (RHS.containsOneValue()) { - // Copies from a liveinterval with a single value are simple to handle and - // very common, handle the special case here. This is important, because - // often RHS is small and LHS is large (e.g. a physreg). - - // Find out if the RHS is defined as a copy from some value in the LHS. - int RHSVal0DefinedFromLHS = -1; - int RHSValID = -1; - VNInfo *RHSValNoInfo = NULL; - VNInfo *RHSValNoInfo0 = RHS.getValNumInfo(0); - unsigned RHSSrcReg = li_->getVNInfoSourceReg(RHSValNoInfo0); - if (RHSSrcReg == 0 || RHSSrcReg != LHS.reg) { - // If RHS is not defined as a copy from the LHS, we can use simpler and - // faster checks to see if the live ranges are coalescable. This joiner - // can't swap the LHS/RHS intervals though. - if (!TargetRegisterInfo::isPhysicalRegister(RHS.reg)) { - return SimpleJoin(LHS, RHS); - } else { - RHSValNoInfo = RHSValNoInfo0; - } - } else { - // It was defined as a copy from the LHS, find out what value # it is. - RHSValNoInfo = - LHS.getLiveRangeContaining(RHSValNoInfo0->def.getPrevSlot())->valno; - RHSValID = RHSValNoInfo->id; - RHSVal0DefinedFromLHS = RHSValID; - } + // Loop over the value numbers of the LHS, seeing if any are defined from + // the RHS. + for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? 
+ continue; - LHSValNoAssignments.resize(LHS.getNumValNums(), -1); - RHSValNoAssignments.resize(RHS.getNumValNums(), -1); - NewVNInfo.resize(LHS.getNumValNums(), NULL); - - // Okay, *all* of the values in LHS that are defined as a copy from RHS - // should now get updated. - for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - unsigned VN = VNI->id; - if (unsigned LHSSrcReg = li_->getVNInfoSourceReg(VNI)) { - if (LHSSrcReg != RHS.reg) { - // If this is not a copy from the RHS, its value number will be - // unmodified by the coalescing. - NewVNInfo[VN] = VNI; - LHSValNoAssignments[VN] = VN; - } else if (RHSValID == -1) { - // Otherwise, it is a copy from the RHS, and we don't already have a - // value# for it. Keep the current value number, but remember it. - LHSValNoAssignments[VN] = RHSValID = VN; - NewVNInfo[VN] = RHSValNoInfo; - LHSValsDefinedFromRHS[VNI] = RHSValNoInfo0; - } else { - // Otherwise, use the specified value #. - LHSValNoAssignments[VN] = RHSValID; - if (VN == (unsigned)RHSValID) { // Else this val# is dead. - NewVNInfo[VN] = RHSValNoInfo; - LHSValsDefinedFromRHS[VNI] = RHSValNoInfo0; - } - } - } else { - NewVNInfo[VN] = VNI; - LHSValNoAssignments[VN] = VN; - } - } + // Never join with a register that has EarlyClobber redefs. + if (VNI->hasRedefByEC()) + return false; - assert(RHSValID != -1 && "Didn't find value #?"); - RHSValNoAssignments[0] = RHSValID; - if (RHSVal0DefinedFromLHS != -1) { - // This path doesn't go through ComputeUltimateVN so just set - // it to anything. - RHSValsDefinedFromLHS[RHSValNoInfo0] = (VNInfo*)1; - } - } else { - // Loop over the value numbers of the LHS, seeing if any are defined from - // the RHS. - for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? - continue; + // DstReg is known to be a register in the LHS interval. If the src is + // from the RHS interval, we can use its value #. + if (!CP.isCoalescable(VNI->getCopy())) + continue; - // DstReg is known to be a register in the LHS interval. If the src is - // from the RHS interval, we can use its value #. - if (li_->getVNInfoSourceReg(VNI) != RHS.reg) - continue; + // Figure out the value # from the RHS. + LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + // The copy could be to an aliased physreg. + if (!lr) continue; + LHSValsDefinedFromRHS[VNI] = lr->valno; + } - // Figure out the value # from the RHS. - LiveRange *lr = RHS.getLiveRangeContaining(VNI->def.getPrevSlot()); - assert(lr && "Cannot find live range"); - LHSValsDefinedFromRHS[VNI] = lr->valno; - } + // Loop over the value numbers of the RHS, seeing if any are defined from + // the LHS. + for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? + continue; - // Loop over the value numbers of the RHS, seeing if any are defined from - // the LHS. - for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - if (VNI->isUnused() || VNI->getCopy() == 0) // Src not defined by a copy? - continue; + // Never join with a register that has EarlyClobber redefs. + if (VNI->hasRedefByEC()) + return false; - // DstReg is known to be a register in the RHS interval. If the src is - // from the LHS interval, we can use its value #. 
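Both of these loops lean on the new CoalescerPair helper instead of the old raw SrcReg/DstReg bookkeeping: CP.isCoalescable(MI) answers whether a given copy instruction moves data between exactly the register pair (and sub-register index) being joined. Only getDstReg() and isCoalescable() appear in this hunk; the construction and seeding of the pair below are an assumption based on how the rewritten JoinCopy driver is expected to reach JoinIntervals, shown purely as a sketch:

    // Sketch only: driving JoinIntervals(CP) with a CoalescerPair.
    // The constructor and setRegisters() are assumed from the new
    // RegisterCoalescer code; this hunk itself only uses getDstReg()
    // and isCoalescable().
    CoalescerPair CP(*tii_, *tri_);
    if (!CP.setRegisters(CopyMI))
      return false;               // not a recognizable copy, nothing to join
    if (!JoinIntervals(CP))
      return false;               // interference: leave the copy in place
    // On success, li_->getOrCreateInterval(CP.getDstReg()) now also carries
    // the ranges that used to belong to the source register.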
- if (li_->getVNInfoSourceReg(VNI) != LHS.reg) - continue; + // DstReg is known to be a register in the RHS interval. If the src is + // from the LHS interval, we can use its value #. + if (!CP.isCoalescable(VNI->getCopy())) + continue; - // Figure out the value # from the LHS. - LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot()); - assert(lr && "Cannot find live range"); - RHSValsDefinedFromLHS[VNI] = lr->valno; - } + // Figure out the value # from the LHS. + LiveRange *lr = LHS.getLiveRangeContaining(VNI->def.getPrevSlot()); + // The copy could be to an aliased physreg. + if (!lr) continue; + RHSValsDefinedFromLHS[VNI] = lr->valno; + } - LHSValNoAssignments.resize(LHS.getNumValNums(), -1); - RHSValNoAssignments.resize(RHS.getNumValNums(), -1); - NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums()); + LHSValNoAssignments.resize(LHS.getNumValNums(), -1); + RHSValNoAssignments.resize(RHS.getNumValNums(), -1); + NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums()); - for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - unsigned VN = VNI->id; - if (LHSValNoAssignments[VN] >= 0 || VNI->isUnused()) - continue; - ComputeUltimateVN(VNI, NewVNInfo, - LHSValsDefinedFromRHS, RHSValsDefinedFromLHS, - LHSValNoAssignments, RHSValNoAssignments); + for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + unsigned VN = VNI->id; + if (LHSValNoAssignments[VN] >= 0 || VNI->isUnused()) + continue; + ComputeUltimateVN(VNI, NewVNInfo, + LHSValsDefinedFromRHS, RHSValsDefinedFromLHS, + LHSValNoAssignments, RHSValNoAssignments); + } + for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); + i != e; ++i) { + VNInfo *VNI = *i; + unsigned VN = VNI->id; + if (RHSValNoAssignments[VN] >= 0 || VNI->isUnused()) + continue; + // If this value number isn't a copy from the LHS, it's a new number. + if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) { + NewVNInfo.push_back(VNI); + RHSValNoAssignments[VN] = NewVNInfo.size()-1; + continue; } - for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end(); - i != e; ++i) { - VNInfo *VNI = *i; - unsigned VN = VNI->id; - if (RHSValNoAssignments[VN] >= 0 || VNI->isUnused()) - continue; - // If this value number isn't a copy from the LHS, it's a new number. - if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) { - NewVNInfo.push_back(VNI); - RHSValNoAssignments[VN] = NewVNInfo.size()-1; - continue; - } - ComputeUltimateVN(VNI, NewVNInfo, - RHSValsDefinedFromLHS, LHSValsDefinedFromRHS, - RHSValNoAssignments, LHSValNoAssignments); - } + ComputeUltimateVN(VNI, NewVNInfo, + RHSValsDefinedFromLHS, LHSValsDefinedFromRHS, + RHSValNoAssignments, LHSValNoAssignments); } // Armed with the mappings of LHS/RHS values to ultimate values, walk the @@ -2399,15 +1440,17 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, LiveInterval::const_iterator JE = RHS.end(); // Skip ahead until the first place of potential sharing. 
- if (I->start < J->start) { - I = std::upper_bound(I, IE, J->start); - if (I != LHS.begin()) --I; - } else if (J->start < I->start) { - J = std::upper_bound(J, JE, I->start); - if (J != RHS.begin()) --J; + if (I != IE && J != JE) { + if (I->start < J->start) { + I = std::upper_bound(I, IE, J->start); + if (I != LHS.begin()) --I; + } else if (J->start < I->start) { + J = std::upper_bound(J, JE, I->start); + if (J != RHS.begin()) --J; + } } - while (1) { + while (I != IE && J != JE) { // Determine if these two live ranges overlap. bool Overlaps; if (I->start < J->start) { @@ -2429,13 +1472,10 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, return false; } - if (I->end < J->end) { + if (I->end < J->end) ++I; - if (I == IE) break; - } else { + else ++J; - if (J == JE) break; - } } // Update kill info. Some live ranges are extended due to copy coalescing. @@ -2443,10 +1483,8 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, E = LHSValsDefinedFromRHS.end(); I != E; ++I) { VNInfo *VNI = I->first; unsigned LHSValID = LHSValNoAssignments[VNI->id]; - NewVNInfo[LHSValID]->removeKill(VNI->def); if (VNI->hasPHIKill()) NewVNInfo[LHSValID]->setHasPHIKill(true); - RHS.addKills(NewVNInfo[LHSValID], VNI->kills); } // Update kill info. Some live ranges are extended due to copy coalescing. @@ -2454,25 +1492,19 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, E = RHSValsDefinedFromLHS.end(); I != E; ++I) { VNInfo *VNI = I->first; unsigned RHSValID = RHSValNoAssignments[VNI->id]; - NewVNInfo[RHSValID]->removeKill(VNI->def); if (VNI->hasPHIKill()) NewVNInfo[RHSValID]->setHasPHIKill(true); - LHS.addKills(NewVNInfo[RHSValID], VNI->kills); } + if (LHSValNoAssignments.empty()) + LHSValNoAssignments.push_back(-1); + if (RHSValNoAssignments.empty()) + RHSValNoAssignments.push_back(-1); + // If we get here, we know that we can coalesce the live ranges. Ask the // intervals to coalesce themselves now. - if ((RHS.ranges.size() > LHS.ranges.size() && - TargetRegisterInfo::isVirtualRegister(LHS.reg)) || - TargetRegisterInfo::isPhysicalRegister(RHS.reg)) { - RHS.join(LHS, &RHSValNoAssignments[0], &LHSValNoAssignments[0], NewVNInfo, - mri_); - Swapped = true; - } else { - LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo, - mri_); - Swapped = false; - } + LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo, + mri_); return true; } @@ -2513,15 +1545,10 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB, // If this isn't a copy nor a extract_subreg, we can't join intervals. 
unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; bool isInsUndef = false; - if (Inst->isExtractSubreg()) { + if (Inst->isCopy()) { DstReg = Inst->getOperand(0).getReg(); SrcReg = Inst->getOperand(1).getReg(); - } else if (Inst->isInsertSubreg()) { - DstReg = Inst->getOperand(0).getReg(); - SrcReg = Inst->getOperand(2).getReg(); - if (Inst->getOperand(1).isUndef()) - isInsUndef = true; - } else if (Inst->isInsertSubreg() || Inst->isSubregToReg()) { + } else if (Inst->isSubregToReg()) { DstReg = Inst->getOperand(0).getReg(); SrcReg = Inst->getOperand(2).getReg(); } else if (!tii_->isMoveInstr(*Inst, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) @@ -2650,6 +1677,8 @@ SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, E = mri_->use_nodbg_end(); I != E; ++I) { MachineOperand &Use = I.getOperand(); MachineInstr *UseMI = Use.getParent(); + if (UseMI->isIdentityCopy()) + continue; unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; if (tii_->isMoveInstr(*UseMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && SrcReg == DstReg && SrcSubIdx == DstSubIdx) @@ -2680,7 +1709,8 @@ SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start, // Ignore identity copies. unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx; - if (!(tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && + if (!MI->isIdentityCopy() && + !(tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx) && SrcReg == DstReg && SrcSubIdx == DstSubIdx)) for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) { MachineOperand &Use = MI->getOperand(i); @@ -2750,10 +1780,9 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // Delete all coalesced copies. bool DoDelete = true; if (!tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - assert((MI->isExtractSubreg() || MI->isInsertSubreg() || - MI->isSubregToReg()) && "Unrecognized copy instruction"); - DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + assert(MI->isCopyLike() && "Unrecognized copy instruction"); + SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) // Do not delete extract_subreg, insert_subreg of physical // registers unless the definition is dead. e.g. // %DO = INSERT_SUBREG %D0, %S0, 1 @@ -2762,7 +1791,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { DoDelete = false; } if (MI->allDefsAreDead()) { - LiveInterval &li = li_->getInterval(DstReg); + LiveInterval &li = li_->getInterval(SrcReg); if (!ShortenDeadCopySrcLiveRange(li, MI)) ShortenDeadCopyLiveRange(li, MI); DoDelete = true; @@ -2812,12 +1841,13 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // If the move will be an identity move delete it bool isMove= tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx); - if (isMove && SrcReg == DstReg && SrcSubIdx == DstSubIdx) { + if (MI->isIdentityCopy() || + (isMove && SrcReg == DstReg && SrcSubIdx == DstSubIdx)) { if (li_->hasInterval(SrcReg)) { LiveInterval &RegInt = li_->getInterval(SrcReg); // If def of this move instruction is dead, remove its live range - // from the dstination register's live interval. - if (MI->registerDefIsDead(DstReg)) { + // from the destination register's live interval. + if (MI->allDefsAreDead()) { if (!ShortenDeadCopySrcLiveRange(RegInt, MI)) ShortenDeadCopyLiveRange(RegInt, MI); } @@ -2832,17 +1862,13 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) { // Check for now unnecessary kill flags. 
if (li_->isNotInMIMap(MI)) continue; - SlotIndex UseIdx = li_->getInstructionIndex(MI).getUseIndex(); + SlotIndex DefIdx = li_->getInstructionIndex(MI).getDefIndex(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isKill()) continue; unsigned reg = MO.getReg(); if (!reg || !li_->hasInterval(reg)) continue; - LiveInterval &LI = li_->getInterval(reg); - const LiveRange *LR = LI.getLiveRangeContaining(UseIdx); - if (!LR || - (!LR->valno->isKill(UseIdx.getDefIndex()) && - LR->valno->def != UseIdx.getDefIndex())) + if (!li_->getInterval(reg).killedAt(DefIdx)) MO.setIsKill(false); } } diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h index 1be04f32aa69..e154da60affa 100644 --- a/lib/CodeGen/SimpleRegisterCoalescing.h +++ b/lib/CodeGen/SimpleRegisterCoalescing.h @@ -105,21 +105,12 @@ namespace llvm { /// possible to coalesce this interval, but it may be possible if other /// things get coalesced, then it returns true by reference in 'Again'. bool JoinCopy(CopyRec &TheCopy, bool &Again); - + /// JoinIntervals - Attempt to join these two intervals. On failure, this - /// returns false. Otherwise, if one of the intervals being joined is a - /// physreg, this method always canonicalizes DestInt to be it. The output - /// "SrcInt" will not have been modified, so we can use this information - /// below to update aliases. - bool JoinIntervals(LiveInterval &LHS, LiveInterval &RHS, bool &Swapped); - - /// SimpleJoin - Attempt to join the specified interval into this one. The - /// caller of this method must guarantee that the RHS only contains a single - /// value number and that the RHS is not defined by a copy from this - /// interval. This returns false if the intervals are not joinable, or it - /// joins them and returns true. - bool SimpleJoin(LiveInterval &LHS, LiveInterval &RHS); - + /// returns false. The output "SrcInt" will not have been modified, so we can + /// use this information below to update aliases. + bool JoinIntervals(CoalescerPair &CP); + /// Return true if the two specified registers belong to different register /// classes. The registers may be either phys or virt regs. bool differingRegisterClasses(unsigned RegA, unsigned RegB) const; @@ -128,8 +119,7 @@ namespace llvm { /// the source value number is defined by a copy from the destination reg /// see if we can merge these two destination reg valno# into a single /// value number, eliminating a copy. - bool AdjustCopiesBackFrom(LiveInterval &IntA, LiveInterval &IntB, - MachineInstr *CopyMI); + bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI); /// HasOtherReachingDefs - Return true if there are definitions of IntB /// other than BValNo val# that can reach uses of AValno val# of IntA. @@ -140,8 +130,7 @@ namespace llvm { /// If the source value number is defined by a commutable instruction and /// its other operand is coalesced to the copy dest register, see if we /// can transform the copy into a noop by commuting the definition. 
- bool RemoveCopyByCommutingDef(LiveInterval &IntA, LiveInterval &IntB, - MachineInstr *CopyMI); + bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI); /// TrimLiveIntervalToLastUse - If there is a last use in the same basic /// block as the copy instruction, trim the ive interval to the last use @@ -155,28 +144,6 @@ namespace llvm { bool ReMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg, unsigned DstSubIdx, MachineInstr *CopyMI); - /// CanCoalesceWithImpDef - Returns true if the specified copy instruction - /// from an implicit def to another register can be coalesced away. - bool CanCoalesceWithImpDef(MachineInstr *CopyMI, - LiveInterval &li, LiveInterval &ImpLi) const; - - /// TurnCopiesFromValNoToImpDefs - The specified value# is defined by an - /// implicit_def and it is being removed. Turn all copies from this value# - /// into implicit_defs. - void TurnCopiesFromValNoToImpDefs(LiveInterval &li, VNInfo *VNI); - - /// isWinToJoinVRWithSrcPhysReg - Return true if it's worth while to join a - /// a virtual destination register with physical source register. - bool isWinToJoinVRWithSrcPhysReg(MachineInstr *CopyMI, - MachineBasicBlock *CopyMBB, - LiveInterval &DstInt, LiveInterval &SrcInt); - - /// isWinToJoinVRWithDstPhysReg - Return true if it's worth while to join a - /// copy from a virtual source register to a physical destination register. - bool isWinToJoinVRWithDstPhysReg(MachineInstr *CopyMI, - MachineBasicBlock *CopyMBB, - LiveInterval &DstInt, LiveInterval &SrcInt); - /// isWinToJoinCrossClass - Return true if it's profitable to coalesce /// two virtual registers from different register classes. bool isWinToJoinCrossClass(unsigned SrcReg, @@ -185,43 +152,12 @@ namespace llvm { const TargetRegisterClass *DstRC, const TargetRegisterClass *NewRC); - /// HasIncompatibleSubRegDefUse - If we are trying to coalesce a virtual - /// register with a physical register, check if any of the virtual register - /// operand is a sub-register use or def. If so, make sure it won't result - /// in an illegal extract_subreg or insert_subreg instruction. - bool HasIncompatibleSubRegDefUse(MachineInstr *CopyMI, - unsigned VirtReg, unsigned PhysReg); - - /// CanJoinExtractSubRegToPhysReg - Return true if it's possible to coalesce - /// an extract_subreg where dst is a physical register, e.g. - /// cl = EXTRACT_SUBREG reg1024, 1 - bool CanJoinExtractSubRegToPhysReg(unsigned DstReg, unsigned SrcReg, - unsigned SubIdx, unsigned &RealDstReg); - - /// CanJoinInsertSubRegToPhysReg - Return true if it's possible to coalesce - /// an insert_subreg where src is a physical register, e.g. - /// reg1024 = INSERT_SUBREG reg1024, c1, 0 - bool CanJoinInsertSubRegToPhysReg(unsigned DstReg, unsigned SrcReg, - unsigned SubIdx, unsigned &RealDstReg); - - /// ValueLiveAt - Return true if the LiveRange pointed to by the given - /// iterator, or any subsequent range with the same value number, - /// is live at the given point. - bool ValueLiveAt(LiveInterval::iterator LRItr, LiveInterval::iterator LREnd, - SlotIndex defPoint) const; - - /// RangeIsDefinedByCopyFromReg - Return true if the specified live range of - /// the specified live interval is defined by a copy from the specified - /// register. - bool RangeIsDefinedByCopyFromReg(LiveInterval &li, LiveRange *LR, - unsigned Reg); - /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and /// update the subregister number if it is not zero. 
If DstReg is a /// physical register and the existing subregister number of the def / use /// being updated is not zero, make sure to set it to the correct physical /// subregister. - void UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx); + void UpdateRegDefsUses(const CoalescerPair &CP); /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy. /// Return true if live interval is removed. @@ -238,6 +174,10 @@ namespace llvm { /// it as well. bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI); + /// RemoveCopyFlag - If DstReg is no longer defined by CopyMI, clear the + /// VNInfo copy flag for DstReg and all aliases. + void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI); + /// lastRegisterUse - Returns the last use of the specific register between /// cycles Start and End or NULL if there are no uses. MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End, diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 059e8d6c19aa..e90869d600dd 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -46,6 +46,8 @@ namespace { Constant *UnregisterFn; Constant *BuiltinSetjmpFn; Constant *FrameAddrFn; + Constant *StackAddrFn; + Constant *StackRestoreFn; Constant *LSDAAddrFn; Value *PersonalityFn; Constant *SelectorFn; @@ -69,7 +71,7 @@ namespace { void insertCallSiteStore(Instruction *I, int Number, Value *CallSite); void markInvokeCallSite(InvokeInst *II, int InvokeNo, Value *CallSite, SwitchInst *CatchSwitch); - void splitLiveRangesLiveAcrossInvokes(SmallVector &Invokes); + void splitLiveRangesAcrossInvokes(SmallVector &Invokes); bool insertSjLjEHSupport(Function &F); }; } // end anonymous namespace @@ -107,6 +109,8 @@ bool SjLjEHPass::doInitialization(Module &M) { PointerType::getUnqual(FunctionContextTy), (Type *)0); FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); + StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); + StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); BuiltinSetjmpFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); SelectorFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_selector); @@ -175,8 +179,10 @@ static void MarkBlocksLiveIn(BasicBlock *BB, std::set &LiveBBs) { /// we spill into a stack location, guaranteeing that there is nothing live /// across the unwind edge. This process also splits all critical edges /// coming out of invoke's. +/// FIXME: Move this function to a common utility file (Local.cpp?) so +/// both SjLj and LowerInvoke can use it. void SjLjEHPass:: -splitLiveRangesLiveAcrossInvokes(SmallVector &Invokes) { +splitLiveRangesAcrossInvokes(SmallVector &Invokes) { // First step, split all critical edges from invoke instructions. for (unsigned i = 0, e = Invokes.size(); i != e; ++i) { InvokeInst *II = Invokes[i]; @@ -198,16 +204,33 @@ splitLiveRangesLiveAcrossInvokes(SmallVector &Invokes) { ++AfterAllocaInsertPt; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { - // This is always a no-op cast because we're casting AI to AI->getType() so - // src and destination types are identical. BitCast is the only possibility. - CastInst *NC = new BitCastInst( - AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); - AI->replaceAllUsesWith(NC); - // Normally its is forbidden to replace a CastInst's operand because it - // could cause the opcode to reflect an illegal conversion. 
However, we're - // replacing it here with the same value it was constructed with to simply - // make NC its user. - NC->setOperand(0, AI); + const Type *Ty = AI->getType(); + // Aggregate types can't be cast, but are legal argument types, so we have + // to handle them differently. We use an extract/insert pair as a + // lightweight method to achieve the same goal. + if (isa(Ty) || isa(Ty) || isa(Ty)) { + Instruction *EI = ExtractValueInst::Create(AI, 0, "",AfterAllocaInsertPt); + Instruction *NI = InsertValueInst::Create(AI, EI, 0); + NI->insertAfter(EI); + AI->replaceAllUsesWith(NI); + // Set the operand of the instructions back to the AllocaInst. + EI->setOperand(0, AI); + NI->setOperand(0, AI); + } else { + // This is always a no-op cast because we're casting AI to AI->getType() + // so src and destination types are identical. BitCast is the only + // possibility. + CastInst *NC = new BitCastInst( + AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt); + AI->replaceAllUsesWith(NC); + // Set the operand of the cast instruction back to the AllocaInst. + // Normally it's forbidden to replace a CastInst's operand because it + // could cause the opcode to reflect an illegal conversion. However, + // we're replacing it here with the same value it was constructed with. + // We do this because the above replaceAllUsesWith() clobbered the + // operand, but we want this one to remain. + NC->setOperand(0, AI); + } } // Finally, scan the code looking for instructions with bad live ranges. @@ -266,6 +289,9 @@ splitLiveRangesLiveAcrossInvokes(SmallVector &Invokes) { } // If we decided we need a spill, do it. + // FIXME: Spilling this way is overkill, as it forces all uses of + // the value to be reloaded from the stack slot, even those that aren't + // in the unwind blocks. We should be more selective. if (NeedsSpill) { ++NumSpilled; DemoteRegToStack(*Inst, true); @@ -294,22 +320,34 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { // If we don't have any invokes or unwinds, there's nothing to do. if (Unwinds.empty() && Invokes.empty()) return false; - // Find the eh.selector.* and eh.exception calls. We'll use the first - // eh.selector to determine the right personality function to use. For - // SJLJ, we always use the same personality for the whole function, - // not on a per-selector basis. + // Find the eh.selector.*, eh.exception and alloca calls. + // + // Remember any allocas() that aren't in the entry block, as the + // jmpbuf saved SP will need to be updated for them. + // + // We'll use the first eh.selector to determine the right personality + // function to use. For SJLJ, we always use the same personality for the + // whole function, not on a per-selector basis. // FIXME: That's a bit ugly. Better way? SmallVector EH_Selectors; SmallVector EH_Exceptions; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + SmallVector JmpbufUpdatePoints; + // Note: Skip the entry block since there's nothing there that interests + // us. eh.selector and eh.exception shouldn't ever be there, and we + // want to disregard any allocas that are there. 
+ for (Function::iterator BB = F.begin(), E = F.end(); ++BB != E;) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (CallInst *CI = dyn_cast(I)) { if (CI->getCalledFunction() == SelectorFn) { - if (!PersonalityFn) PersonalityFn = CI->getOperand(2); + if (!PersonalityFn) PersonalityFn = CI->getArgOperand(1); EH_Selectors.push_back(CI); } else if (CI->getCalledFunction() == ExceptionFn) { EH_Exceptions.push_back(CI); + } else if (CI->getCalledFunction() == StackRestoreFn) { + JmpbufUpdatePoints.push_back(CI); } + } else if (AllocaInst *AI = dyn_cast(I)) { + JmpbufUpdatePoints.push_back(AI); } } } @@ -329,7 +367,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { // we spill into a stack location, guaranteeing that there is nothing live // across the unwind edge. This process also splits all critical edges // coming out of invoke's. - splitLiveRangesLiveAcrossInvokes(Invokes); + splitLiveRangesAcrossInvokes(Invokes); BasicBlock *EntryBB = F.begin(); // Create an alloca for the incoming jump buffer ptr and the new jump buffer @@ -419,7 +457,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { // Populate the Function Context // 1. LSDA address // 2. Personality function address - // 3. jmpbuf (save FP and call eh.sjlj.setjmp) + // 3. jmpbuf (save SP, FP and call eh.sjlj.setjmp) // LSDA address Idxs[0] = Zero; @@ -440,31 +478,41 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { new StoreInst(PersonalityFn, PersonalityFieldPtr, true, EntryBB->getTerminator()); - // Save the frame pointer. + // Save the frame pointer. Idxs[1] = ConstantInt::get(Int32Ty, 5); - Value *FieldPtr + Value *JBufPtr = GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2, "jbuf_gep", EntryBB->getTerminator()); Idxs[1] = ConstantInt::get(Int32Ty, 0); - Value *ElemPtr = - GetElementPtrInst::Create(FieldPtr, Idxs, Idxs+2, "jbuf_fp_gep", + Value *FramePtr = + GetElementPtrInst::Create(JBufPtr, Idxs, Idxs+2, "jbuf_fp_gep", EntryBB->getTerminator()); Value *Val = CallInst::Create(FrameAddrFn, ConstantInt::get(Int32Ty, 0), "fp", EntryBB->getTerminator()); - new StoreInst(Val, ElemPtr, true, EntryBB->getTerminator()); - // Call the setjmp instrinsic. It fills in the rest of the jmpbuf + new StoreInst(Val, FramePtr, true, EntryBB->getTerminator()); + + // Save the stack pointer. + Idxs[1] = ConstantInt::get(Int32Ty, 2); + Value *StackPtr = + GetElementPtrInst::Create(JBufPtr, Idxs, Idxs+2, "jbuf_sp_gep", + EntryBB->getTerminator()); + + Val = CallInst::Create(StackAddrFn, "sp", EntryBB->getTerminator()); + new StoreInst(Val, StackPtr, true, EntryBB->getTerminator()); + + // Call the setjmp instrinsic. It fills in the rest of the jmpbuf. Value *SetjmpArg = - CastInst::Create(Instruction::BitCast, FieldPtr, + CastInst::Create(Instruction::BitCast, JBufPtr, Type::getInt8PtrTy(F.getContext()), "", EntryBB->getTerminator()); Value *DispatchVal = CallInst::Create(BuiltinSetjmpFn, SetjmpArg, "dispatch", EntryBB->getTerminator()); - // check the return value of the setjmp. non-zero goes to dispatcher + // check the return value of the setjmp. non-zero goes to dispatcher. Value *IsNormal = new ICmpInst(EntryBB->getTerminator(), ICmpInst::ICMP_EQ, DispatchVal, Zero, "notunwind"); @@ -509,6 +557,16 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) { Unwinds[i]->eraseFromParent(); } + // Following any allocas not in the entry block, update the saved SP + // in the jmpbuf to the new value. 
+ for (unsigned i = 0, e = JmpbufUpdatePoints.size(); i != e; ++i) { + Instruction *AI = JmpbufUpdatePoints[i]; + Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp"); + StackAddr->insertAfter(AI); + Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true); + StoreStackAddr->insertAfter(StackAddr); + } + // Finally, for any returns from this function, if this function contains an // invoke, add a call to unregister the function context. for (unsigned i = 0, e = Returns.size(); i != e; ++i) diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index 6110ef5d2f05..7a227cf02d57 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -213,9 +213,11 @@ void SlotIndexes::dump() const { // Print a SlotIndex to a raw_ostream. void SlotIndex::print(raw_ostream &os) const { - os << getIndex(); + os << entry().getIndex(); if (isPHI()) os << "*"; + else + os << "LudS"[getSlot()]; } // Dump a SlotIndex to stderr. diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp index a7b2efe11825..56bcb2824ae8 100644 --- a/lib/CodeGen/Spiller.cpp +++ b/lib/CodeGen/Spiller.cpp @@ -14,18 +14,20 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include using namespace llvm; namespace { - enum SpillerName { trivial, standard, splitting }; + enum SpillerName { trivial, standard, splitting, inline_ }; } static cl::opt @@ -35,6 +37,7 @@ spillerOpt("spiller", cl::values(clEnumVal(trivial, "trivial spiller"), clEnumVal(standard, "default spiller"), clEnumVal(splitting, "splitting spiller"), + clEnumValN(inline_, "inline", "inline spiller"), clEnumValEnd), cl::init(standard)); @@ -53,8 +56,8 @@ protected: const TargetInstrInfo *tii; const TargetRegisterInfo *tri; VirtRegMap *vrm; - - /// Construct a spiller base. + + /// Construct a spiller base. SpillerBase(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) : mf(mf), lis(lis), vrm(vrm) { @@ -67,7 +70,8 @@ protected: /// Add spill ranges for every use/def of the live interval, inserting loads /// immediately before each use, and stores after each def. No folding or /// remat is attempted. - std::vector trivialSpillEverywhere(LiveInterval *li) { + void trivialSpillEverywhere(LiveInterval *li, + std::vector &newIntervals) { DEBUG(dbgs() << "Spilling everywhere " << *li << "\n"); assert(li->weight != HUGE_VALF && @@ -78,8 +82,6 @@ protected: DEBUG(dbgs() << "Trivial spill everywhere of reg" << li->reg << "\n"); - std::vector added; - const TargetRegisterClass *trc = mri->getRegClass(li->reg); unsigned ss = vrm->assignVirt2StackSlot(li->reg); @@ -96,7 +98,7 @@ protected: do { ++regItr; } while (regItr != mri->reg_end() && (&*regItr == mi)); - + // Collect uses & defs for this instr. SmallVector indices; bool hasUse = false; @@ -116,7 +118,7 @@ protected: vrm->assignVirt2StackSlot(newVReg, ss); LiveInterval *newLI = &lis->getOrCreateInterval(newVReg); newLI->weight = HUGE_VALF; - + // Update the reg operands & kill flags. 
for (unsigned i = 0; i < indices.size(); ++i) { unsigned mopIdx = indices[i]; @@ -136,10 +138,10 @@ protected: MachineInstr *loadInstr(prior(miItr)); SlotIndex loadIndex = lis->InsertMachineInstrInMaps(loadInstr).getDefIndex(); + vrm->addSpillSlotUse(ss, loadInstr); SlotIndex endIndex = loadIndex.getNextIndex(); VNInfo *loadVNI = newLI->getNextValue(loadIndex, 0, true, lis->getVNInfoAllocator()); - loadVNI->addKill(endIndex); newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI)); } @@ -150,17 +152,15 @@ protected: MachineInstr *storeInstr(llvm::next(miItr)); SlotIndex storeIndex = lis->InsertMachineInstrInMaps(storeInstr).getDefIndex(); + vrm->addSpillSlotUse(ss, storeInstr); SlotIndex beginIndex = storeIndex.getPrevIndex(); VNInfo *storeVNI = newLI->getNextValue(beginIndex, 0, true, lis->getVNInfoAllocator()); - storeVNI->addKill(storeIndex); newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI)); } - added.push_back(newLI); + newIntervals.push_back(newLI); } - - return added; } }; @@ -176,11 +176,12 @@ public: TrivialSpiller(MachineFunction *mf, LiveIntervals *lis, VirtRegMap *vrm) : SpillerBase(mf, lis, vrm) {} - std::vector spill(LiveInterval *li, - SmallVectorImpl &spillIs, - SlotIndex*) { + void spill(LiveInterval *li, + std::vector &newIntervals, + SmallVectorImpl &, + SlotIndex*) { // Ignore spillIs - we don't use it. - return trivialSpillEverywhere(li); + trivialSpillEverywhere(li, newIntervals); } }; @@ -200,10 +201,13 @@ public: : lis(lis), loopInfo(loopInfo), vrm(vrm) {} /// Falls back on LiveIntervals::addIntervalsForSpills. - std::vector spill(LiveInterval *li, - SmallVectorImpl &spillIs, - SlotIndex*) { - return lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm); + void spill(LiveInterval *li, + std::vector &newIntervals, + SmallVectorImpl &spillIs, + SlotIndex*) { + std::vector added = + lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm); + newIntervals.insert(newIntervals.end(), added.begin(), added.end()); } }; @@ -214,7 +218,7 @@ namespace { /// When a call to spill is placed this spiller will first try to break the /// interval up into its component values (one new interval per value). /// If this fails, or if a call is placed to spill a previously split interval -/// then the spiller falls back on the standard spilling mechanism. +/// then the spiller falls back on the standard spilling mechanism. 
class SplittingSpiller : public StandardSpiller { public: SplittingSpiller(MachineFunction *mf, LiveIntervals *lis, @@ -226,22 +230,21 @@ public: tri = mf->getTarget().getRegisterInfo(); } - std::vector spill(LiveInterval *li, - SmallVectorImpl &spillIs, - SlotIndex *earliestStart) { - - if (worthTryingToSplit(li)) { - return tryVNISplit(li, earliestStart); - } - // else - return StandardSpiller::spill(li, spillIs, earliestStart); + void spill(LiveInterval *li, + std::vector &newIntervals, + SmallVectorImpl &spillIs, + SlotIndex *earliestStart) { + if (worthTryingToSplit(li)) + tryVNISplit(li, earliestStart); + else + StandardSpiller::spill(li, newIntervals, spillIs, earliestStart); } private: MachineRegisterInfo *mri; const TargetInstrInfo *tii; - const TargetRegisterInfo *tri; + const TargetRegisterInfo *tri; DenseSet alreadySplit; bool worthTryingToSplit(LiveInterval *li) const { @@ -258,18 +261,18 @@ private: SmallVector vnis; std::copy(li->vni_begin(), li->vni_end(), std::back_inserter(vnis)); - + for (SmallVectorImpl::iterator vniItr = vnis.begin(), vniEnd = vnis.end(); vniItr != vniEnd; ++vniItr) { VNInfo *vni = *vniItr; - - // Skip unused VNIs, or VNIs with no kills. - if (vni->isUnused() || vni->kills.empty()) + + // Skip unused VNIs. + if (vni->isUnused()) continue; DEBUG(dbgs() << " Extracted Val #" << vni->id << " as "); LiveInterval *splitInterval = extractVNI(li, vni); - + if (splitInterval != 0) { DEBUG(dbgs() << *splitInterval << "\n"); added.push_back(splitInterval); @@ -281,12 +284,12 @@ private: } else { DEBUG(dbgs() << "0\n"); } - } + } DEBUG(dbgs() << "Original LI: " << *li << "\n"); // If there original interval still contains some live ranges - // add it to added and alreadySplit. + // add it to added and alreadySplit. if (!li->empty()) { added.push_back(li); alreadySplit.insert(li); @@ -302,16 +305,15 @@ private: /// Extract the given value number from the interval. LiveInterval* extractVNI(LiveInterval *li, VNInfo *vni) const { assert(vni->isDefAccurate() || vni->isPHIDef()); - assert(!vni->kills.empty()); - // Create a new vreg and live interval, copy VNI kills & ranges over. + // Create a new vreg and live interval, copy VNI ranges over. const TargetRegisterClass *trc = mri->getRegClass(li->reg); unsigned newVReg = mri->createVirtualRegister(trc); vrm->grow(); LiveInterval *newLI = &lis->getOrCreateInterval(newVReg); VNInfo *newVNI = newLI->createValueCopy(vni, lis->getVNInfoAllocator()); - // Start by copying all live ranges in the VN to the new interval. + // Start by copying all live ranges in the VN to the new interval. for (LiveInterval::iterator rItr = li->begin(), rEnd = li->end(); rItr != rEnd; ++rItr) { if (rItr->valno == vni) { @@ -319,7 +321,7 @@ private: } } - // Erase the old VNI & ranges. + // Erase the old VNI & ranges. li->removeValNo(vni); // Collect all current uses of the register belonging to the given VNI. @@ -336,15 +338,13 @@ private: // Insert a copy at the start of the MBB. The range proceeding the // copy will be attached to the original LiveInterval. 
MachineBasicBlock *defMBB = lis->getMBBFromIndex(newVNI->def); - tii->copyRegToReg(*defMBB, defMBB->begin(), newVReg, li->reg, trc, trc, - DebugLoc()); - MachineInstr *copyMI = defMBB->begin(); - copyMI->addRegisterKilled(li->reg, tri); + MachineInstr *copyMI = BuildMI(*defMBB, defMBB->begin(), DebugLoc(), + tii->get(TargetOpcode::COPY), newVReg) + .addReg(li->reg, RegState::Kill); SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); VNInfo *phiDefVNI = li->getNextValue(lis->getMBBStartIdx(defMBB), 0, false, lis->getVNInfoAllocator()); phiDefVNI->setIsPHIDef(true); - phiDefVNI->addKill(copyIdx.getDefIndex()); li->addRange(LiveRange(phiDefVNI->def, copyIdx.getDefIndex(), phiDefVNI)); LiveRange *oldPHIDefRange = newLI->getLiveRangeContaining(lis->getMBBStartIdx(defMBB)); @@ -367,8 +367,8 @@ private: newVNI->setIsPHIDef(false); // not a PHI def anymore. newVNI->setIsDefAccurate(true); } else { - // non-PHI def. Rename the def. If it's two-addr that means renaming the use - // and inserting a new copy too. + // non-PHI def. Rename the def. If it's two-addr that means renaming the + // use and inserting a new copy too. MachineInstr *defInst = lis->getInstructionFromIndex(newVNI->def); // We'll rename this now, so we can remove it from uses. uses.erase(defInst); @@ -384,38 +384,26 @@ private: twoAddrUseIsUndef = true; } } - + SlotIndex defIdx = lis->getInstructionIndex(defInst); newVNI->def = defIdx.getDefIndex(); if (isTwoAddr && !twoAddrUseIsUndef) { MachineBasicBlock *defMBB = defInst->getParent(); - tii->copyRegToReg(*defMBB, defInst, newVReg, li->reg, trc, trc, - DebugLoc()); - MachineInstr *copyMI = prior(MachineBasicBlock::iterator(defInst)); + MachineInstr *copyMI = BuildMI(*defMBB, defInst, DebugLoc(), + tii->get(TargetOpcode::COPY), newVReg) + .addReg(li->reg, RegState::Kill); SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); - copyMI->addRegisterKilled(li->reg, tri); LiveRange *origUseRange = li->getLiveRangeContaining(newVNI->def.getUseIndex()); - VNInfo *origUseVNI = origUseRange->valno; origUseRange->end = copyIdx.getDefIndex(); - bool updatedKills = false; - for (unsigned k = 0; k < origUseVNI->kills.size(); ++k) { - if (origUseVNI->kills[k] == defIdx.getDefIndex()) { - origUseVNI->kills[k] = copyIdx.getDefIndex(); - updatedKills = true; - break; - } - } - assert(updatedKills && "Failed to update VNI kill list."); VNInfo *copyVNI = newLI->getNextValue(copyIdx.getDefIndex(), copyMI, true, lis->getVNInfoAllocator()); - copyVNI->addKill(defIdx.getDefIndex()); LiveRange copyRange(copyIdx.getDefIndex(),defIdx.getDefIndex(),copyVNI); newLI->addRange(copyRange); - } + } } - + for (std::set::iterator usesItr = uses.begin(), usesEnd = uses.end(); usesItr != usesEnd; ++usesItr) { @@ -435,7 +423,7 @@ private: // Check if this instr is two address. unsigned useOpIdx = useInst->findRegisterUseOperandIdx(li->reg); bool isTwoAddress = useInst->isRegTiedToDefOperand(useOpIdx); - + // Rename uses (and defs for two-address instrs). for (unsigned i = 0; i < useInst->getNumOperands(); ++i) { MachineOperand &mo = useInst->getOperand(i); @@ -451,10 +439,9 @@ private: // reg. 
MachineBasicBlock *useMBB = useInst->getParent(); MachineBasicBlock::iterator useItr(useInst); - tii->copyRegToReg(*useMBB, llvm::next(useItr), li->reg, newVReg, trc, trc, - DebugLoc()); - MachineInstr *copyMI = llvm::next(useItr); - copyMI->addRegisterKilled(newVReg, tri); + MachineInstr *copyMI = BuildMI(*useMBB, llvm::next(useItr), DebugLoc(), + tii->get(TargetOpcode::COPY), newVReg) + .addReg(li->reg, RegState::Kill); SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); // Change the old two-address defined range & vni to start at @@ -470,56 +457,44 @@ private: VNInfo *copyVNI = newLI->getNextValue(useIdx.getDefIndex(), 0, true, lis->getVNInfoAllocator()); - copyVNI->addKill(copyIdx.getDefIndex()); LiveRange copyRange(useIdx.getDefIndex(),copyIdx.getDefIndex(),copyVNI); newLI->addRange(copyRange); } } - - // Iterate over any PHI kills - we'll need to insert new copies for them. - for (VNInfo::KillSet::iterator - killItr = newVNI->kills.begin(), killEnd = newVNI->kills.end(); - killItr != killEnd; ++killItr) { - SlotIndex killIdx(*killItr); - if (killItr->isPHI()) { - MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx); - LiveRange *oldKillRange = - newLI->getLiveRangeContaining(killIdx); - - assert(oldKillRange != 0 && "No kill range?"); - - tii->copyRegToReg(*killMBB, killMBB->getFirstTerminator(), - li->reg, newVReg, trc, trc, - DebugLoc()); - MachineInstr *copyMI = prior(killMBB->getFirstTerminator()); - copyMI->addRegisterKilled(newVReg, tri); - SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); - // Save the current end. We may need it to add a new range if the - // current range runs of the end of the MBB. - SlotIndex newKillRangeEnd = oldKillRange->end; - oldKillRange->end = copyIdx.getDefIndex(); + // Iterate over any PHI kills - we'll need to insert new copies for them. + for (LiveInterval::iterator LRI = newLI->begin(), LRE = newLI->end(); + LRI != LRE; ++LRI) { + if (LRI->valno != newVNI || LRI->end.isPHI()) + continue; + SlotIndex killIdx = LRI->end; + MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx); + MachineInstr *copyMI = BuildMI(*killMBB, killMBB->getFirstTerminator(), + DebugLoc(), tii->get(TargetOpcode::COPY), + li->reg) + .addReg(newVReg, RegState::Kill); + SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI); - if (newKillRangeEnd != lis->getMBBEndIdx(killMBB)) { - assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB) && - "PHI kill range doesn't reach kill-block end. Not sane."); - newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB), - newKillRangeEnd, newVNI)); - } + // Save the current end. We may need it to add a new range if the + // current range runs of the end of the MBB. + SlotIndex newKillRangeEnd = LRI->end; + LRI->end = copyIdx.getDefIndex(); - *killItr = oldKillRange->end; - VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(), - copyMI, true, - lis->getVNInfoAllocator()); - newKillVNI->addKill(lis->getMBBTerminatorGap(killMBB)); - newKillVNI->setHasPHIKill(true); - li->addRange(LiveRange(copyIdx.getDefIndex(), - lis->getMBBEndIdx(killMBB), - newKillVNI)); + if (newKillRangeEnd != lis->getMBBEndIdx(killMBB)) { + assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB) && + "PHI kill range doesn't reach kill-block end. 
Not sane."); + newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB), + newKillRangeEnd, newVNI)); } + VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(), + copyMI, true, + lis->getVNInfoAllocator()); + newKillVNI->setHasPHIKill(true); + li->addRange(LiveRange(copyIdx.getDefIndex(), + lis->getMBBEndIdx(killMBB), + newKillVNI)); } - newVNI->setHasPHIKill(false); return newLI; @@ -530,6 +505,13 @@ private: } // end anonymous namespace +namespace llvm { +Spiller *createInlineSpiller(MachineFunction*, + LiveIntervals*, + const MachineLoopInfo*, + VirtRegMap*); +} + llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis, const MachineLoopInfo *loopInfo, VirtRegMap *vrm) { @@ -538,5 +520,6 @@ llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis, case trivial: return new TrivialSpiller(mf, lis, vrm); case standard: return new StandardSpiller(lis, loopInfo, vrm); case splitting: return new SplittingSpiller(mf, lis, loopInfo, vrm); + case inline_: return createInlineSpiller(mf, lis, loopInfo, vrm); } } diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h index dda52e871fea..450447b3933a 100644 --- a/lib/CodeGen/Spiller.h +++ b/lib/CodeGen/Spiller.h @@ -33,11 +33,19 @@ namespace llvm { public: virtual ~Spiller() = 0; - /// Spill the given live range. The method used will depend on the Spiller - /// implementation selected. - virtual std::vector spill(LiveInterval *li, - SmallVectorImpl &spillIs, - SlotIndex *earliestIndex = 0) = 0; + /// spill - Spill the given live interval. The method used will depend on + /// the Spiller implementation selected. + /// + /// @param li The live interval to be spilled. + /// @param spillIs A list of intervals that are about to be spilled, + /// and so cannot be used for remat etc. + /// @param newIntervals The newly created intervals will be appended here. + /// @param earliestIndex The earliest point for splitting. (OK, it's another + /// pointer to the allocator guts). + virtual void spill(LiveInterval *li, + std::vector &newIntervals, + SmallVectorImpl &spillIs, + SlotIndex *earliestIndex = 0) = 0; }; diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 8a6a727a1f97..ca5c28ce010c 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -136,7 +136,7 @@ bool StackProtector::RequiresStackProtector() const { bool StackProtector::InsertStackProtectors() { BasicBlock *FailBB = 0; // The basic block to jump to if check fails. AllocaInst *AI = 0; // Place on stack that stores the stack guard. - Constant *StackGuardVar = 0; // The stack guard variable. + Value *StackGuardVar = 0; // The stack guard variable. 
for (Function::iterator I = F->begin(), E = F->end(); I != E; ) { BasicBlock *BB = I++; @@ -153,9 +153,17 @@ bool StackProtector::InsertStackProtectors() { // StackGuard = load __stack_chk_guard // call void @llvm.stackprotect.create(StackGuard, StackGuardSlot) // - PointerType *PtrTy = PointerType::getUnqual( - Type::getInt8Ty(RI->getContext())); - StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy); + const PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext()); + unsigned AddressSpace, Offset; + if (TLI->getStackCookieLocation(AddressSpace, Offset)) { + Constant *OffsetVal = + ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset); + + StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal, + PointerType::get(PtrTy, AddressSpace)); + } else { + StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy); + } BasicBlock &Entry = F->getEntryBlock(); Instruction *InsPt = &Entry.front(); diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 7f3b452f0a5a..eff3c33e3daa 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -508,8 +509,7 @@ bool StackSlotColoring::PropagateBackward(MachineBasicBlock::iterator MII, // Abort the use is actually a sub-register def. We don't have enough // information to figure out if it is really legal. - if (MO.getSubReg() || MII->isExtractSubreg() || - MII->isInsertSubreg() || MII->isSubregToReg()) + if (MO.getSubReg() || MII->isSubregToReg()) return false; const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI); @@ -571,7 +571,7 @@ bool StackSlotColoring::PropagateForward(MachineBasicBlock::iterator MII, // Abort the use is actually a sub-register use. We don't have enough // information to figure out if it is really legal. 
- if (MO.getSubReg() || MII->isExtractSubreg()) + if (MO.getSubReg()) return false; const TargetRegisterClass *RC = TID.OpInfo[i].getRegClass(TRI); @@ -610,8 +610,8 @@ StackSlotColoring::UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI, DEBUG(MI->dump()); ++NumLoadElim; } else { - TII->copyRegToReg(*MBB, MI, DstReg, Reg, RC, RC, - MI->getDebugLoc()); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), + DstReg).addReg(Reg); ++NumRegRepl; } @@ -627,8 +627,8 @@ StackSlotColoring::UnfoldAndRewriteInstruction(MachineInstr *MI, int OldFI, DEBUG(MI->dump()); ++NumStoreElim; } else { - TII->copyRegToReg(*MBB, MI, Reg, SrcReg, RC, RC, - MI->getDebugLoc()); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY), Reg) + .addReg(SrcReg); ++NumRegRepl; } diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp index 142398cc1642..59315cf67282 100644 --- a/lib/CodeGen/StrongPHIElimination.cpp +++ b/lib/CodeGen/StrongPHIElimination.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterCoalescer.h" @@ -695,9 +696,8 @@ void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB, // Insert copy from curr.second to a temporary at // the Phi defining curr.second MachineBasicBlock::iterator PI = MRI.getVRegDef(curr.second); - TII->copyRegToReg(*PI->getParent(), PI, t, - curr.second, RC, RC, DebugLoc()); - + BuildMI(*PI->getParent(), PI, DebugLoc(), TII->get(TargetOpcode::COPY), + t).addReg(curr.second); DEBUG(dbgs() << "Inserted copy from " << curr.second << " to " << t << "\n"); @@ -712,8 +712,8 @@ void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB, } // Insert copy from map[curr.first] to curr.second - TII->copyRegToReg(*MBB, MBB->getFirstTerminator(), curr.second, - map[curr.first], RC, RC, DebugLoc()); + BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), curr.second).addReg(map[curr.first]); map[curr.first] = curr.second; DEBUG(dbgs() << "Inserted copy from " << curr.first << " to " << curr.second << "\n"); @@ -761,8 +761,8 @@ void StrongPHIElimination::ScheduleCopies(MachineBasicBlock* MBB, // Insert a copy from dest to a new temporary t at the end of b unsigned t = MF->getRegInfo().createVirtualRegister(RC); - TII->copyRegToReg(*MBB, MBB->getFirstTerminator(), t, - curr.second, RC, RC, DebugLoc()); + BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), t).addReg(curr.second); map[curr.second] = t; MachineBasicBlock::iterator TI = MBB->getFirstTerminator(); @@ -830,9 +830,6 @@ void StrongPHIElimination::InsertCopies(MachineDomTreeNode* MDTN, LiveInterval& Int = LI.getInterval(I->getOperand(i).getReg()); VNInfo* FirstVN = *Int.vni_begin(); FirstVN->setHasPHIKill(false); - if (I->getOperand(i).isKill()) - FirstVN->addKill(LI.getInstructionIndex(I).getUseIndex()); - LiveRange LR (LI.getMBBStartIdx(I->getParent()), LI.getInstructionIndex(I).getUseIndex().getNextSlot(), FirstVN); @@ -959,9 +956,8 @@ bool StrongPHIElimination::runOnMachineFunction(MachineFunction &Fn) { } else { // Insert a last-minute copy if a conflict was detected. 
const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); - const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(I->first); - TII->copyRegToReg(*SI->second, SI->second->getFirstTerminator(), - I->first, SI->first, RC, RC, DebugLoc()); + BuildMI(*SI->second, SI->second->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), I->first).addReg(SI->first); LI.renumber(); diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index f2e2a76f00eb..075db803bd23 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/Target/TargetInstrInfo.h" @@ -559,11 +560,9 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, } MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { - const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first); - TII->copyRegToReg(*PredBB, Loc, CopyInfos[i].first, - CopyInfos[i].second, RC,RC, DebugLoc()); - MachineInstr *CopyMI = prior(Loc); - Copies.push_back(CopyMI); + Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(), + TII->get(TargetOpcode::COPY), + CopyInfos[i].first).addReg(CopyInfos[i].second)); } NumInstrDups += TailBB->size() - 1; // subtract one for removed branch @@ -618,11 +617,10 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, } MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { - const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first); - TII->copyRegToReg(*PrevBB, Loc, CopyInfos[i].first, - CopyInfos[i].second, RC, RC, DebugLoc()); - MachineInstr *CopyMI = prior(Loc); - Copies.push_back(CopyMI); + Copies.push_back(BuildMI(*PrevBB, Loc, DebugLoc(), + TII->get(TargetOpcode::COPY), + CopyInfos[i].first) + .addReg(CopyInfos[i].second)); } } else { // No PHIs to worry about, just splice the instructions over. diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index 0ad6619ac4fd..cdacb98e0e88 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/ADT/SmallVector.h" @@ -21,11 +22,34 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PostRAHazardRecognizer.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything +/// after it, replacing it with an unconditional branch to NewDest. +void +TargetInstrInfoImpl::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, + MachineBasicBlock *NewDest) const { + MachineBasicBlock *MBB = Tail->getParent(); + + // Remove all the old successors of MBB from the CFG. 
+ while (!MBB->succ_empty()) + MBB->removeSuccessor(MBB->succ_begin()); + + // Remove all the dead instructions from the end of MBB. + MBB->erase(Tail, MBB->end()); + + // If MBB isn't immediately before MBB, insert a branch to it. + if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest)) + InsertBranch(*MBB, NewDest, 0, SmallVector(), + Tail->getDebugLoc()); + MBB->addSuccessor(NewDest); +} + // commuteInstruction - The default implementation of this method just exchanges // the two operands returned by findCommutedOpIndices. MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI, @@ -136,17 +160,9 @@ void TargetInstrInfoImpl::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo &TRI) const { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); - MachineOperand &MO = MI->getOperand(0); - if (TargetRegisterInfo::isVirtualRegister(DestReg)) { - MO.setReg(DestReg); - MO.setSubReg(SubIdx); - } else if (SubIdx) { - MO.setReg(TRI->getSubReg(DestReg, SubIdx)); - } else { - MO.setReg(DestReg); - } + MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); } @@ -175,6 +191,47 @@ TargetInstrInfoImpl::GetFunctionSizeInBytes(const MachineFunction &MF) const { return FnSize; } +// If the COPY instruction in MI can be folded to a stack operation, return +// the register class to use. +static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI, + unsigned FoldIdx) { + assert(MI->isCopy() && "MI must be a COPY instruction"); + if (MI->getNumOperands() != 2) + return 0; + assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand"); + + const MachineOperand &FoldOp = MI->getOperand(FoldIdx); + const MachineOperand &LiveOp = MI->getOperand(1-FoldIdx); + + if (FoldOp.getSubReg() || LiveOp.getSubReg()) + return 0; + + unsigned FoldReg = FoldOp.getReg(); + unsigned LiveReg = LiveOp.getReg(); + + assert(TargetRegisterInfo::isVirtualRegister(FoldReg) && + "Cannot fold physregs"); + + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(FoldReg); + + if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg())) + return RC->contains(LiveOp.getReg()) ? RC : 0; + + const TargetRegisterClass *LiveRC = MRI.getRegClass(LiveReg); + if (RC == LiveRC || RC->hasSubClass(LiveRC)) + return RC; + + // FIXME: Allow folding when register classes are memory compatible. + return 0; +} + +bool TargetInstrInfoImpl:: +canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl &Ops) const { + return MI->isCopy() && Ops.size() == 1 && canFoldCopy(MI, Ops[0]); +} + /// foldMemoryOperand - Attempt to fold a load or store of the specified stack /// slot into the specified machine instruction for the specified operand(s). /// If this is possible, a new instruction is returned with the specified @@ -182,10 +239,9 @@ TargetInstrInfoImpl::GetFunctionSizeInBytes(const MachineFunction &MF) const { /// removing the old instruction and adding the new one in the instruction /// stream. 
 MachineInstr*
-TargetInstrInfo::foldMemoryOperand(MachineFunction &MF,
-                                   MachineInstr* MI,
+TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
                                    const SmallVectorImpl<unsigned> &Ops,
-                                   int FrameIndex) const {
+                                   int FI) const {
   unsigned Flags = 0;
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     if (MI->getOperand(Ops[i]).isDef())
@@ -193,34 +249,56 @@ TargetInstrInfo::foldMemoryOperand(MachineFunction &MF,
     else
       Flags |= MachineMemOperand::MOLoad;
 
+  MachineBasicBlock *MBB = MI->getParent();
+  assert(MBB && "foldMemoryOperand needs an inserted instruction");
+  MachineFunction &MF = *MBB->getParent();
+
   // Ask the target to do the actual folding.
-  MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
-  if (!NewMI) return 0;
+  if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) {
+    // Add a memory operand, foldMemoryOperandImpl doesn't do that.
+    assert((!(Flags & MachineMemOperand::MOStore) ||
+            NewMI->getDesc().mayStore()) &&
+           "Folded a def to a non-store!");
+    assert((!(Flags & MachineMemOperand::MOLoad) ||
+            NewMI->getDesc().mayLoad()) &&
+           "Folded a use to a non-load!");
+    const MachineFrameInfo &MFI = *MF.getFrameInfo();
+    assert(MFI.getObjectOffset(FI) != -1);
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                              Flags, /*Offset=*/0,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    NewMI->addMemOperand(MF, MMO);
-  assert((!(Flags & MachineMemOperand::MOStore) ||
-          NewMI->getDesc().mayStore()) &&
-         "Folded a def to a non-store!");
-  assert((!(Flags & MachineMemOperand::MOLoad) ||
-          NewMI->getDesc().mayLoad()) &&
-         "Folded a use to a non-load!");
-  const MachineFrameInfo &MFI = *MF.getFrameInfo();
-  assert(MFI.getObjectOffset(FrameIndex) != -1);
-  MachineMemOperand *MMO =
-    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIndex),
-                            Flags, /*Offset=*/0,
-                            MFI.getObjectSize(FrameIndex),
-                            MFI.getObjectAlignment(FrameIndex));
-  NewMI->addMemOperand(MF, MMO);
+    // FIXME: change foldMemoryOperandImpl semantics to also insert NewMI.
+    return MBB->insert(MI, NewMI);
+  }
-  return NewMI;
+  // Straight COPY may fold as load/store.
+  if (!MI->isCopy() || Ops.size() != 1)
+    return 0;
+
+  const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+  if (!RC)
+    return 0;
+
+  const MachineOperand &MO = MI->getOperand(1-Ops[0]);
+  MachineBasicBlock::iterator Pos = MI;
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+
+  if (Flags == MachineMemOperand::MOStore)
+    storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
+  else
+    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI);
+  return --Pos;
 }
 
 /// foldMemoryOperand - Same as the previous version except it allows folding
 /// of any load and store from / to any address, not just from a specific
 /// stack slot.
 MachineInstr*
-TargetInstrInfo::foldMemoryOperand(MachineFunction &MF,
-                                   MachineInstr* MI,
+TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
                                    const SmallVectorImpl<unsigned> &Ops,
                                    MachineInstr* LoadMI) const {
   assert(LoadMI->getDesc().canFoldAsLoad() && "LoadMI isn't foldable!");
@@ -228,11 +306,15 @@ TargetInstrInfo::foldMemoryOperand(MachineFunction &MF,
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     assert(MI->getOperand(Ops[i]).isUse() && "Folding load into def!");
 #endif
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
 
   // Ask the target to do the actual folding.
MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, LoadMI); if (!NewMI) return 0; + NewMI = MBB.insert(MI, NewMI); + // Copy the memoperands from the load to the folded instruction. NewMI->setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end()); @@ -240,11 +322,9 @@ TargetInstrInfo::foldMemoryOperand(MachineFunction &MF, return NewMI; } -bool -TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(const MachineInstr * - MI, - AliasAnalysis * - AA) const { +bool TargetInstrInfo:: +isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI, + AliasAnalysis *AA) const { const MachineFunction &MF = *MI->getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetMachine &TM = MF.getTarget(); @@ -324,3 +404,31 @@ TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(const MachineInstr * // Everything checked out. return true; } + +/// isSchedulingBoundary - Test if the given instruction should be +/// considered a scheduling boundary. This primarily includes labels +/// and terminators. +bool TargetInstrInfoImpl::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const{ + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isLabel()) + return true; + + // Don't attempt to schedule around any instruction that defines + // a stack-oriented pointer, as it's unlikely to be profitable. This + // saves compile time, because it doesn't require every single + // stack slot reference to depend on the instruction that does the + // modification. + const TargetLowering &TLI = *MF.getTarget().getTargetLowering(); + if (MI->definesRegister(TLI.getStackPointerRegisterToSaveRestore())) + return true; + + return false; +} + +// Default implementation of CreateTargetPostRAHazardRecognizer. 
+ScheduleHazardRecognizer *TargetInstrInfoImpl:: +CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const { + return (ScheduleHazardRecognizer *)new PostRAHazardRecognizer(II); +} diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 71ad3fb6f99f..a80cfc4b256f 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -825,32 +825,32 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx, TargetLoweringObjectFile::Initialize(Ctx, TM); TextSection = getContext().getCOFFSection(".text", - MCSectionCOFF::IMAGE_SCN_CNT_CODE | - MCSectionCOFF::IMAGE_SCN_MEM_EXECUTE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_CNT_CODE | + COFF::IMAGE_SCN_MEM_EXECUTE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getText()); DataSection = getContext().getCOFFSection(".data", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); ReadOnlySection = getContext().getCOFFSection(".rdata", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); StaticCtorSection = getContext().getCOFFSection(".ctors", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); StaticDtorSection = getContext().getCOFFSection(".dtors", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); // FIXME: We're emitting LSDA info into a readonly section on COFF, even @@ -859,76 +859,76 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx, // adjusted or this should be a data section. LSDASection = getContext().getCOFFSection(".gcc_except_table", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getReadOnly()); EHFrameSection = getContext().getCOFFSection(".eh_frame", - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE, + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE, SectionKind::getDataRel()); // Debug info. 
DwarfAbbrevSection = getContext().getCOFFSection(".debug_abbrev", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfInfoSection = getContext().getCOFFSection(".debug_info", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfLineSection = getContext().getCOFFSection(".debug_line", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfFrameSection = getContext().getCOFFSection(".debug_frame", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfPubNamesSection = getContext().getCOFFSection(".debug_pubnames", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfPubTypesSection = getContext().getCOFFSection(".debug_pubtypes", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfStrSection = getContext().getCOFFSection(".debug_str", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfLocSection = getContext().getCOFFSection(".debug_loc", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfARangesSection = getContext().getCOFFSection(".debug_aranges", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfRangesSection = getContext().getCOFFSection(".debug_ranges", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DwarfMacroInfoSection = getContext().getCOFFSection(".debug_macinfo", - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE | - MCSectionCOFF::IMAGE_SCN_MEM_READ, + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); DrectveSection = getContext().getCOFFSection(".drectve", - MCSectionCOFF::IMAGE_SCN_LNK_INFO, + COFF::IMAGE_SCN_LNK_INFO, SectionKind::getMetadata()); } @@ -936,27 +936,27 @@ static unsigned getCOFFSectionFlags(SectionKind K) { unsigned Flags = 0; - if (!K.isMetadata()) + if (K.isMetadata()) Flags |= - MCSectionCOFF::IMAGE_SCN_MEM_DISCARDABLE; + COFF::IMAGE_SCN_MEM_DISCARDABLE; else if (K.isText()) Flags |= - MCSectionCOFF::IMAGE_SCN_MEM_EXECUTE | - MCSectionCOFF::IMAGE_SCN_CNT_CODE; + COFF::IMAGE_SCN_MEM_EXECUTE | + COFF::IMAGE_SCN_CNT_CODE; else if (K.isBSS ()) Flags |= - MCSectionCOFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE; + COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE; else if (K.isReadOnly()) Flags |= - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - 
MCSectionCOFF::IMAGE_SCN_MEM_READ; + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ; else if (K.isWriteable()) Flags |= - MCSectionCOFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - MCSectionCOFF::IMAGE_SCN_MEM_READ | - MCSectionCOFF::IMAGE_SCN_MEM_WRITE; + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_MEM_WRITE; return Flags; } @@ -995,10 +995,10 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, unsigned Characteristics = getCOFFSectionFlags(Kind); - Characteristics |= MCSectionCOFF::IMAGE_SCN_LNK_COMDAT; + Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT; return getContext().getCOFFSection(Name.str(), Characteristics, - MCSectionCOFF::IMAGE_COMDAT_SELECT_EXACT_MATCH, Kind); + COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH, Kind); } if (Kind.isText()) diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 3d10dc13d20b..564914373bb5 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -381,7 +382,7 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, DstReg = 0; unsigned SrcSubIdx, DstSubIdx; if (!TII->isMoveInstr(MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) { - if (MI.isExtractSubreg()) { + if (MI.isCopy()) { DstReg = MI.getOperand(0).getReg(); SrcReg = MI.getOperand(1).getReg(); } else if (MI.isInsertSubreg()) { @@ -897,6 +898,108 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, } } } + + // If this is an instruction with a load folded into it, try unfolding + // the load, e.g. avoid this: + // movq %rdx, %rcx + // addq (%rax), %rcx + // in favor of this: + // movq (%rax), %rcx + // addq %rdx, %rcx + // because it's preferable to schedule a load than a register copy. + if (TID.mayLoad() && !regBKilled) { + // Determine if a load can be unfolded. + unsigned LoadRegIndex; + unsigned NewOpc = + TII->getOpcodeAfterMemoryUnfold(mi->getOpcode(), + /*UnfoldLoad=*/true, + /*UnfoldStore=*/false, + &LoadRegIndex); + if (NewOpc != 0) { + const TargetInstrDesc &UnfoldTID = TII->get(NewOpc); + if (UnfoldTID.getNumDefs() == 1) { + MachineFunction &MF = *mbbi->getParent(); + + // Unfold the load. + DEBUG(dbgs() << "2addr: UNFOLDING: " << *mi); + const TargetRegisterClass *RC = + UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI); + unsigned Reg = MRI->createVirtualRegister(RC); + SmallVector NewMIs; + if (!TII->unfoldMemoryOperand(MF, mi, Reg, + /*UnfoldLoad=*/true,/*UnfoldStore=*/false, + NewMIs)) { + DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n"); + return false; + } + assert(NewMIs.size() == 2 && + "Unfolded a load into multiple instructions!"); + // The load was previously folded, so this is the only use. + NewMIs[1]->addRegisterKilled(Reg, TRI); + + // Tentatively insert the instructions into the block so that they + // look "normal" to the transformation logic. + mbbi->insert(mi, NewMIs[0]); + mbbi->insert(mi, NewMIs[1]); + + DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0] + << "2addr: NEW INST: " << *NewMIs[1]); + + // Transform the instruction, now that it no longer has a load. 
+ unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA); + unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB); + MachineBasicBlock::iterator NewMI = NewMIs[1]; + bool TransformSuccess = + TryInstructionTransform(NewMI, mi, mbbi, + NewSrcIdx, NewDstIdx, Dist); + if (TransformSuccess || + NewMIs[1]->getOperand(NewSrcIdx).isKill()) { + // Success, or at least we made an improvement. Keep the unfolded + // instructions and discard the original. + if (LV) { + for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { + MachineOperand &MO = mi->getOperand(i); + if (MO.isReg() && MO.getReg() != 0 && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + if (MO.isUse()) { + if (MO.isKill()) { + if (NewMIs[0]->killsRegister(MO.getReg())) + LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[0]); + else { + assert(NewMIs[1]->killsRegister(MO.getReg()) && + "Kill missing after load unfold!"); + LV->replaceKillInstruction(MO.getReg(), mi, NewMIs[1]); + } + } + } else if (LV->removeVirtualRegisterDead(MO.getReg(), mi)) { + if (NewMIs[1]->registerDefIsDead(MO.getReg())) + LV->addVirtualRegisterDead(MO.getReg(), NewMIs[1]); + else { + assert(NewMIs[0]->registerDefIsDead(MO.getReg()) && + "Dead flag missing after load unfold!"); + LV->addVirtualRegisterDead(MO.getReg(), NewMIs[0]); + } + } + } + } + LV->addVirtualRegisterKilled(Reg, NewMIs[1]); + } + mi->eraseFromParent(); + mi = NewMIs[1]; + if (TransformSuccess) + return true; + } else { + // Transforming didn't eliminate the tie and didn't lead to an + // improvement. Clean up the unfolded instructions and keep the + // original. + DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n"); + NewMIs[0]->eraseFromParent(); + NewMIs[1]->eraseFromParent(); + } + } + } + } + return false; } @@ -1047,14 +1150,12 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){ DEBUG(dbgs() << "2addr: REMATTING : " << *DefMI << "\n"); unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg(); - TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, TRI); + TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, *TRI); ReMatRegs.set(regB); ++NumReMats; } else { - bool Emitted = TII->copyRegToReg(*mbbi, mi, regA, regB, rc, rc, - mi->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Unable to issue a copy instruction!\n"); + BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY), + regA).addReg(regB); } MachineBasicBlock::iterator prevMI = prior(mi); @@ -1104,12 +1205,30 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } } } - + + // Schedule the source copy / remat inserted to form two-address + // instruction. FIXME: Does it matter the distance map may not be + // accurate after it's scheduled? + TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); + MadeChange = true; DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); } + // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. + if (mi->isInsertSubreg()) { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg:subidx = COPY %subreg + unsigned SubIdx = mi->getOperand(3).getImm(); + mi->RemoveOperand(3); + assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); + mi->getOperand(0).setSubReg(SubIdx); + mi->RemoveOperand(1); + mi->setDesc(TII->get(TargetOpcode::COPY)); + DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); + } + // Clear TiedOperands here instead of at the top of the loop // since most instructions do not have tied operands. 
TiedOperands.clear(); @@ -1136,14 +1255,13 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { static void UpdateRegSequenceSrcs(unsigned SrcReg, unsigned DstReg, unsigned SubIdx, - MachineRegisterInfo *MRI) { + MachineRegisterInfo *MRI, + const TargetRegisterInfo &TRI) { for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg), RE = MRI->reg_end(); RI != RE; ) { MachineOperand &MO = RI.getOperand(); ++RI; - MO.setReg(DstReg); - assert(MO.getSubReg() == 0); - MO.setSubReg(SubIdx); + MO.substVirtReg(DstReg, SubIdx, TRI); } } @@ -1165,55 +1283,102 @@ TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector &Srcs, if (!Seen.insert(SrcReg)) continue; - // If there are no other uses than extract_subreg which feed into + // Check that the instructions are all in the same basic block. + MachineInstr *SrcDefMI = MRI->getVRegDef(SrcReg); + MachineInstr *DstDefMI = MRI->getVRegDef(DstReg); + if (SrcDefMI->getParent() != DstDefMI->getParent()) + continue; + + // If there are no other uses than copies which feed into // the reg_sequence, then we might be able to coalesce them. bool CanCoalesce = true; - SmallVector SubIndices; + SmallVector SrcSubIndices, DstSubIndices; for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineInstr *UseMI = &*UI; - if (!UseMI->isExtractSubreg() || - UseMI->getOperand(0).getReg() != DstReg) { + if (!UseMI->isCopy() || UseMI->getOperand(0).getReg() != DstReg) { CanCoalesce = false; break; } - SubIndices.push_back(UseMI->getOperand(2).getImm()); + SrcSubIndices.push_back(UseMI->getOperand(1).getSubReg()); + DstSubIndices.push_back(UseMI->getOperand(0).getSubReg()); } - if (!CanCoalesce || SubIndices.size() < 2) + if (!CanCoalesce || SrcSubIndices.size() < 2) continue; - std::sort(SubIndices.begin(), SubIndices.end()); - unsigned NewSubIdx = 0; - if (TRI->canCombinedSubRegIndex(MRI->getRegClass(SrcReg), SubIndices, - NewSubIdx)) { - bool Proceed = true; - if (NewSubIdx) - for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg), - RE = MRI->reg_end(); RI != RE; ) { - MachineOperand &MO = RI.getOperand(); - ++RI; - // FIXME: If the sub-registers do not combine to the whole - // super-register, i.e. NewSubIdx != 0, and any of the use has a - // sub-register index, then abort the coalescing attempt. - if (MO.getSubReg()) { - Proceed = false; - break; - } - MO.setReg(DstReg); - MO.setSubReg(NewSubIdx); - } - if (Proceed) - for (MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(SrcReg), - RE = MRI->reg_end(); RI != RE; ) { - MachineOperand &MO = RI.getOperand(); - ++RI; - MO.setReg(DstReg); - if (NewSubIdx) - MO.setSubReg(NewSubIdx); - } + // Check that the source subregisters can be combined. + std::sort(SrcSubIndices.begin(), SrcSubIndices.end()); + unsigned NewSrcSubIdx = 0; + if (!TRI->canCombineSubRegIndices(MRI->getRegClass(SrcReg), SrcSubIndices, + NewSrcSubIdx)) + continue; + + // Check that the destination subregisters can also be combined. + std::sort(DstSubIndices.begin(), DstSubIndices.end()); + unsigned NewDstSubIdx = 0; + if (!TRI->canCombineSubRegIndices(MRI->getRegClass(DstReg), DstSubIndices, + NewDstSubIdx)) + continue; + + // If neither source nor destination can be combined to the full register, + // just give up. This could be improved if it ever matters. 
+ if (NewSrcSubIdx != 0 && NewDstSubIdx != 0) + continue; + + // Now that we know that all the uses are extract_subregs and that those + // subregs can somehow be combined, scan all the extract_subregs again to + // make sure the subregs are in the right order and can be composed. + MachineInstr *SomeMI = 0; + CanCoalesce = true; + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SrcReg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { + MachineInstr *UseMI = &*UI; + assert(UseMI->isCopy()); + unsigned DstSubIdx = UseMI->getOperand(0).getSubReg(); + unsigned SrcSubIdx = UseMI->getOperand(1).getSubReg(); + assert(DstSubIdx != 0 && "missing subreg from RegSequence elimination"); + if ((NewDstSubIdx == 0 && + TRI->composeSubRegIndices(NewSrcSubIdx, DstSubIdx) != SrcSubIdx) || + (NewSrcSubIdx == 0 && + TRI->composeSubRegIndices(NewDstSubIdx, SrcSubIdx) != DstSubIdx)) { + CanCoalesce = false; + break; + } + // Keep track of one of the uses. + SomeMI = UseMI; + } + if (!CanCoalesce) + continue; + + // Insert a copy to replace the original. + MachineBasicBlock::iterator InsertLoc = SomeMI; + MachineInstr *CopyMI = BuildMI(*SomeMI->getParent(), SomeMI, + SomeMI->getDebugLoc(), + TII->get(TargetOpcode::COPY)) + .addReg(DstReg, RegState::Define, NewDstSubIdx) + .addReg(SrcReg, 0, NewSrcSubIdx); + + // Remove all the old extract instructions. + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SrcReg), + UE = MRI->use_nodbg_end(); UI != UE; ) { + MachineInstr *UseMI = &*UI; + ++UI; + if (UseMI == CopyMI) + continue; + assert(UseMI->isCopy()); + // Move any kills to the new copy or extract instruction. + if (UseMI->getOperand(1).isKill()) { + CopyMI->getOperand(1).setIsKill(); + if (LV) + // Update live variables + LV->replaceKillInstruction(SrcReg, UseMI, &*CopyMI); } + UseMI->eraseFromParent(); + } } } @@ -1268,15 +1433,13 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { } IsImpDef = false; - // Remember EXTRACT_SUBREG sources. These might be candidate for - // coalescing. - if (DefMI->isExtractSubreg()) + // Remember COPY sources. These might be candidate for coalescing. + if (DefMI->isCopy() && DefMI->getOperand(1).getSubReg()) RealSrcs.push_back(DefMI->getOperand(1).getReg()); - if (!Seen.insert(SrcReg) || - MI->getParent() != DefMI->getParent() || - !MI->getOperand(i).isKill() || - HasOtherRegSequenceUses(SrcReg, MI, MRI)) { + bool isKill = MI->getOperand(i).isKill(); + if (!Seen.insert(SrcReg) || MI->getParent() != DefMI->getParent() || + !isKill || HasOtherRegSequenceUses(SrcReg, MI, MRI)) { // REG_SEQUENCE cannot have duplicated operands, add a copy. // Also add an copy if the source is live-in the block. We don't want // to end up with a partial-redef of a livein, e.g. @@ -1292,30 +1455,23 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { // If the REG_SEQUENCE doesn't kill its source, keeping live variables // correctly up to date becomes very difficult. Insert a copy. 
// - const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); - unsigned NewReg = MRI->createVirtualRegister(RC); MachineBasicBlock::iterator InsertLoc = MI; - bool Emitted = - TII->copyRegToReg(*MI->getParent(), InsertLoc, NewReg, SrcReg, RC, RC, - MI->getDebugLoc()); - (void)Emitted; - assert(Emitted && "Unable to issue a copy instruction!\n"); - MI->getOperand(i).setReg(NewReg); - if (MI->getOperand(i).isKill()) { - MachineBasicBlock::iterator CopyMI = prior(InsertLoc); - MachineOperand *KillMO = CopyMI->findRegisterUseOperand(SrcReg); - KillMO->setIsKill(); - if (LV) - // Update live variables - LV->replaceKillInstruction(SrcReg, MI, &*CopyMI); - } + MachineInstr *CopyMI = BuildMI(*MI->getParent(), InsertLoc, + MI->getDebugLoc(), TII->get(TargetOpcode::COPY)) + .addReg(DstReg, RegState::Define, MI->getOperand(i+1).getImm()) + .addReg(SrcReg, getKillRegState(isKill)); + MI->getOperand(i).setReg(0); + if (LV && isKill) + LV->replaceKillInstruction(SrcReg, MI, CopyMI); + DEBUG(dbgs() << "Inserted: " << *CopyMI); } } for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) { unsigned SrcReg = MI->getOperand(i).getReg(); + if (!SrcReg) continue; unsigned SubIdx = MI->getOperand(i+1).getImm(); - UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI); + UpdateRegSequenceSrcs(SrcReg, DstReg, SubIdx, MRI, *TRI); } if (IsImpDef) { @@ -1328,8 +1484,11 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { MI->eraseFromParent(); } - // Try coalescing some EXTRACT_SUBREG instructions. - CoalesceExtSubRegs(RealSrcs, DstReg); + // Try coalescing some EXTRACT_SUBREG instructions. This can create + // INSERT_SUBREG instructions that must have flags added by + // LiveIntervalAnalysis, so only run it when LiveVariables is available. + if (LV) + CoalesceExtSubRegs(RealSrcs, DstReg); } RegSequences.clear(); diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp index 871d83628ac1..57a1500e6e9d 100644 --- a/lib/CodeGen/VirtRegRewriter.cpp +++ b/lib/CodeGen/VirtRegRewriter.cpp @@ -667,8 +667,7 @@ static void ReMaterialize(MachineBasicBlock &MBB, assert(TID.getNumDefs() == 1 && "Don't know how to remat instructions that define > 1 values!"); #endif - TII->reMaterialize(MBB, MII, DestReg, - ReMatDefMI->getOperand(0).getSubReg(), ReMatDefMI, TRI); + TII->reMaterialize(MBB, MII, DestReg, 0, ReMatDefMI, *TRI); MachineInstr *NewMI = prior(MII); for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) { MachineOperand &MO = NewMI->getOperand(i); @@ -769,7 +768,7 @@ void AvailableSpills::AddAvailableRegsToLiveIn(MachineBasicBlock &MBB, I = PhysRegsAvailable.begin(), E = PhysRegsAvailable.end(); I != E; ++I) { unsigned Reg = I->first; - const TargetRegisterClass* RC = TRI->getPhysicalRegisterRegClass(Reg); + const TargetRegisterClass* RC = TRI->getMinimalPhysRegClass(Reg); // FIXME: A temporary workaround. We can't reuse available value if it's // not safe to move the def of the virtual register's class. e.g. // X86::RFP* register classes. Do not add it as a live-in. 
@@ -1022,7 +1021,7 @@ static unsigned FindFreeRegister(MachineBasicBlock::iterator MII, for (unsigned i = 0, e = Kills.size(); i != e; ++i) { unsigned Kill = Kills[i]; if (!Defs[Kill] && !Uses[Kill] && - TRI->getPhysicalRegisterRegClass(Kill) == RC) + RC->contains(Kill)) return Kill; } for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) { @@ -1410,25 +1409,25 @@ OptimizeByUnfold(MachineBasicBlock::iterator &MII, if (TII->unfoldMemoryOperand(MF, &MI, UnfoldVR, false, false, NewMIs)) { assert(NewMIs.size() == 1); MachineInstr *NewMI = NewMIs.back(); + MBB->insert(MII, NewMI); NewMIs.clear(); int Idx = NewMI->findRegisterUseOperandIdx(VirtReg, false); assert(Idx != -1); SmallVector Ops; Ops.push_back(Idx); - MachineInstr *FoldedMI = TII->foldMemoryOperand(MF, NewMI, Ops, SS); + MachineInstr *FoldedMI = TII->foldMemoryOperand(NewMI, Ops, SS); + NewMI->eraseFromParent(); if (FoldedMI) { VRM->addSpillSlotUse(SS, FoldedMI); if (!VRM->hasPhys(UnfoldVR)) VRM->assignVirt2Phys(UnfoldVR, UnfoldPR); VRM->virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef); - MII = MBB->insert(MII, FoldedMI); + MII = FoldedMI; InvalidateKills(MI, TRI, RegKills, KillOps); VRM->RemoveMachineInstrFromMaps(&MI); MBB->erase(&MI); - MF.DeleteMachineInstr(NewMI); return true; } - MF.DeleteMachineInstr(NewMI); } } @@ -1480,7 +1479,6 @@ CommuteToFoldReload(MachineBasicBlock::iterator &MII, if (MII == MBB->begin() || !MII->killsRegister(SrcReg)) return false; - MachineFunction &MF = *MBB->getParent(); MachineInstr &MI = *MII; MachineBasicBlock::iterator DefMII = prior(MII); MachineInstr *DefMI = DefMII; @@ -1511,11 +1509,12 @@ CommuteToFoldReload(MachineBasicBlock::iterator &MII, MachineInstr *CommutedMI = TII->commuteInstruction(DefMI, true); if (!CommutedMI) return false; + MBB->insert(MII, CommutedMI); SmallVector Ops; Ops.push_back(NewDstIdx); - MachineInstr *FoldedMI = TII->foldMemoryOperand(MF, CommutedMI, Ops, SS); + MachineInstr *FoldedMI = TII->foldMemoryOperand(CommutedMI, Ops, SS); // Not needed since foldMemoryOperand returns new MI. - MF.DeleteMachineInstr(CommutedMI); + CommutedMI->eraseFromParent(); if (!FoldedMI) return false; @@ -1528,7 +1527,7 @@ CommuteToFoldReload(MachineBasicBlock::iterator &MII, MachineInstr *StoreMI = MII; VRM->addSpillSlotUse(SS, StoreMI); VRM->virtFolded(VirtReg, StoreMI, VirtRegMap::isMod); - MII = MBB->insert(MII, FoldedMI); // Update MII to backtrack. + MII = FoldedMI; // Update MII to backtrack. // Delete all 3 old instructions. InvalidateKills(*ReloadMI, TRI, RegKills, KillOps); @@ -1704,7 +1703,7 @@ bool LocalRewriter::InsertEmergencySpills(MachineInstr *MI) { std::vector &EmSpills = VRM->getEmergencySpills(MI); for (unsigned i = 0, e = EmSpills.size(); i != e; ++i) { unsigned PhysReg = EmSpills[i]; - const TargetRegisterClass *RC = TRI->getPhysicalRegisterRegClass(PhysReg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); assert(RC && "Unable to determine register class!"); int SS = VRM->getEmergencySpillSlot(RC); if (UsedSS.count(SS)) @@ -1759,7 +1758,6 @@ bool LocalRewriter::InsertRestores(MachineInstr *MI, bool DoReMat = VRM->isReMaterialized(VirtReg); int SSorRMId = DoReMat ? 
VRM->getReMatId(VirtReg) : VRM->getStackSlot(VirtReg); - const TargetRegisterClass* RC = MRI->getRegClass(VirtReg); unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SSorRMId); if (InReg == Phys) { // If the value is already available in the expected register, save @@ -1793,20 +1791,16 @@ bool LocalRewriter::InsertRestores(MachineInstr *MI, MachineBasicBlock::iterator InsertLoc = ComputeReloadLoc(MII, MBB->begin(), Phys, TRI, DoReMat, SSorRMId, TII, *MBB->getParent()); - - TII->copyRegToReg(*MBB, InsertLoc, Phys, InReg, RC, RC, - MI->getDebugLoc()); + MachineInstr *CopyMI = BuildMI(*MBB, InsertLoc, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), Phys) + .addReg(InReg, RegState::Kill); // This invalidates Phys. Spills.ClobberPhysReg(Phys); // Remember it's available. Spills.addAvailable(SSorRMId, Phys); - // Mark is killed. - MachineInstr *CopyMI = prior(InsertLoc); CopyMI->setAsmPrinterFlag(MachineInstr::ReloadReuse); - MachineOperand *KillOpnd = CopyMI->findRegisterUseOperand(InReg); - KillOpnd->setIsKill(); UpdateKills(*CopyMI, TRI, RegKills, KillOps); DEBUG(dbgs() << '\t' << *CopyMI); @@ -2013,7 +2007,7 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // = EXTRACT_SUBREG fi#1 // fi#1 is available in EDI, but it cannot be reused because it's not in // the right register file. - if (PhysReg && !AvoidReload && (SubIdx || MI.isExtractSubreg())) { + if (PhysReg && !AvoidReload && SubIdx) { const TargetRegisterClass* RC = MRI->getRegClass(VirtReg); if (!RC->contains(PhysReg)) PhysReg = 0; @@ -2034,6 +2028,18 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, CanReuse = !ReusedOperands.isClobbered(PhysReg) && Spills.canClobberPhysReg(PhysReg); } + // If this is an asm, and PhysReg is used elsewhere as an earlyclobber + // operand, we can't also use it as an input. (Outputs always come + // before inputs, so we can stop looking at i.) 
+ if (MI.isInlineAsm()) { + for (unsigned k=0; kgetPhys(VirtReg); assert(DesignatedReg && "Must map virtreg to physreg!"); @@ -2136,7 +2144,6 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, continue; } - const TargetRegisterClass* RC = MRI->getRegClass(VirtReg); MRI->setPhysRegUsed(DesignatedReg); ReusedOperands.markClobbered(DesignatedReg); @@ -2144,11 +2151,9 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, MachineBasicBlock::iterator InsertLoc = ComputeReloadLoc(&MI, MBB->begin(), PhysReg, TRI, DoReMat, SSorRMId, TII, MF); - - TII->copyRegToReg(*MBB, InsertLoc, DesignatedReg, PhysReg, RC, RC, - MI.getDebugLoc()); - - MachineInstr *CopyMI = prior(InsertLoc); + MachineInstr *CopyMI = BuildMI(*MBB, InsertLoc, MI.getDebugLoc(), + TII->get(TargetOpcode::COPY), + DesignatedReg).addReg(PhysReg); CopyMI->setAsmPrinterFlag(MachineInstr::ReloadReuse); UpdateKills(*CopyMI, TRI, RegKills, KillOps); @@ -2269,27 +2274,16 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, if (unsigned InReg = Spills.getSpillSlotOrReMatPhysReg(SS)) { DEBUG(dbgs() << "Promoted Load To Copy: " << MI); if (DestReg != InReg) { - const TargetRegisterClass *RC = MRI->getRegClass(VirtReg); - TII->copyRegToReg(*MBB, &MI, DestReg, InReg, RC, RC, - MI.getDebugLoc()); MachineOperand *DefMO = MI.findRegisterDefOperand(DestReg); - unsigned SubIdx = DefMO->getSubReg(); + MachineInstr *CopyMI = BuildMI(*MBB, &MI, MI.getDebugLoc(), + TII->get(TargetOpcode::COPY)) + .addReg(DestReg, RegState::Define, DefMO->getSubReg()) + .addReg(InReg, RegState::Kill); // Revisit the copy so we make sure to notice the effects of the // operation on the destreg (either needing to RA it if it's // virtual or needing to clobber any values if it's physical). - NextMII = &MI; - --NextMII; // backtrack to the copy. + NextMII = CopyMI; NextMII->setAsmPrinterFlag(MachineInstr::ReloadReuse); - // Propagate the sub-register index over. - if (SubIdx) { - DefMO = NextMII->findRegisterDefOperand(DestReg); - DefMO->setSubReg(SubIdx); - } - - // Mark is killed. - MachineOperand *KillOpnd = NextMII->findRegisterUseOperand(InReg); - KillOpnd->setIsKill(); - BackTracked = true; } else { DEBUG(dbgs() << "Removing now-noop copy: " << MI); @@ -2430,6 +2424,24 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // Also check if it's copying from an "undef", if so, we can't // eliminate this or else the undef marker is lost and it will // confuses the scavenger. This is extremely rare. + if (MI.isIdentityCopy() && !MI.getOperand(1).isUndef() && + MI.getNumOperands() == 2) { + ++NumDCE; + DEBUG(dbgs() << "Removing now-noop copy: " << MI); + SmallVector KillRegs; + InvalidateKills(MI, TRI, RegKills, KillOps, &KillRegs); + if (MO.isDead() && !KillRegs.empty()) { + // Source register or an implicit super/sub-register use is killed. + assert(TRI->regsOverlap(KillRegs[0], MI.getOperand(0).getReg())); + // Last def is now dead. + TransferDeadness(MI.getOperand(1).getReg(), RegKills, KillOps); + } + VRM->RemoveMachineInstrFromMaps(&MI); + MBB->erase(&MI); + Erased = true; + Spills.disallowClobberPhysReg(VirtReg); + goto ProcessNextInst; + } unsigned Src, Dst, SrcSR, DstSR; if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && Src == Dst && SrcSR == DstSR && @@ -2519,6 +2531,16 @@ LocalRewriter::RewriteMBB(LiveIntervals *LIs, // Check to see if this is a noop copy. If so, eliminate the // instruction before considering the dest reg to be changed. 
+ if (MI.isIdentityCopy()) { + ++NumDCE; + DEBUG(dbgs() << "Removing now-noop copy: " << MI); + InvalidateKills(MI, TRI, RegKills, KillOps); + VRM->RemoveMachineInstrFromMaps(&MI); + MBB->erase(&MI); + Erased = true; + UpdateKills(*LastStore, TRI, RegKills, KillOps); + goto ProcessNextInst; + } { unsigned Src, Dst, SrcSR, DstSR; if (TII->isMoveInstr(MI, Src, Dst, SrcSR, DstSR) && -- cgit v1.2.3
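Note on the recurring idiom in this patch: the removed TII->copyRegToReg() hook is
replaced throughout (TailDuplication, TwoAddressInstructionPass, VirtRegRewriter,
among others) by directly emitting a target-independent COPY with BuildMI. The
sketch below illustrates that idiom against the LLVM C++ API of this revision; the
helper name insertRegCopy and its parameters are illustrative only, not part of the
patch.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/DebugLoc.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"

using namespace llvm;

// Emit "DstReg = COPY SrcReg" before InsertPt. The generic COPY is lowered to a
// real target move (or coalesced away) later, so callers no longer look up a
// register class or check a success flag as they did with copyRegToReg().
static MachineInstr *insertRegCopy(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator InsertPt,
                                   DebugLoc DL, const TargetInstrInfo *TII,
                                   unsigned DstReg, unsigned SrcReg,
                                   bool SrcIsKill) {
  return BuildMI(MBB, InsertPt, DL, TII->get(TargetOpcode::COPY), DstReg)
           .addReg(SrcReg, getKillRegState(SrcIsKill));
}

The same builder pattern appears in the hunks above; only the insertion point, the
kill flag, and an optional sub-register index on either operand differ.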